From 86f8cd04af1c955af67e6e0f68a21d92ee917b23 Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky Date: Wed, 15 May 2024 11:09:04 -0600 Subject: [PATCH 01/29] configure.ac stuff for cJSON [skip ci] --- configure.ac | 227 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 158 insertions(+), 69 deletions(-) diff --git a/configure.ac b/configure.ac index f05b4cee0..1bc4ca95e 100644 --- a/configure.ac +++ b/configure.ac @@ -74,6 +74,74 @@ AC_CONFIG_FILES([Makefile test/Makefile]) +### Our macros ###################################################### + +# Macro to validate executable versions. Arguments: +# +# $1 name of variable containing executable name or absolute path +# $2 minimum version +# $3 append to $1 to make shell pipeline to get actual version only +# (e.g., without program name) +# +# This macro is not able to determine if a program exists, only whether its +# version is sufficient. ${!1} (i.e, the value of the variable whose name is +# stored in $1) must be either empty, an absolute path to an executable, or +# the name of a program in $PATH. A prior macro such as AX_WITH_PROG can be +# used to ensure this condition. +# +# If ${!1} is an absolute path, and that file isn’t executable, error out. If +# it’s something other than an absolute path, assume it’s the name of a +# program in $PATH; if not, the behavior is undefined but not good (FIXME). +# +# Post-conditions: +# +# 1. If ${!1} is non-empty and the version reported by the program is +# greater than or equal to the minimum, ${!1} is unchanged. If ${!1} is +# empty or reported version is insufficient, ${!1} is the empty string. +# This lets you test version sufficiency by whether ${!1} is empty. +# +# 2. $1_VERSION_NOTE contains a brief explanatory note. 
+# +AC_DEFUN([CH_CHECK_VERSION], [ + AS_VAR_PUSHDEF([prog], [$1]) + AS_IF([test -n "$prog"], [ + # ${!1} is non-empty + AS_CASE([$prog], + # absolute path; check if executable (-x, not mere existence) + [/*], [AC_MSG_CHECKING([if $prog is executable]) + AS_IF([test -x "$prog"], + [AC_MSG_RESULT([ok])], + [AC_MSG_RESULT([no]) + AC_MSG_ERROR([must be executable])])]) + AC_MSG_CHECKING([if $prog version >= $2]) + vact=$($prog $3) + AX_COMPARE_VERSION([$2], [le], [$vact], [ + AC_SUBST([$1_VERSION_NOTE], ["ok ($vact)"]) + AC_MSG_RESULT([ok ($vact)]) + ], [ + AC_SUBST([$1_VERSION_NOTE], ["too old ($vact)"]) + AC_MSG_RESULT([too old ($vact)]) + AS_UNSET([$1]) + ]) + ], [ + # ${!1} is empty + AC_SUBST([$1_VERSION_NOTE], ["not found"]) + AS_UNSET([$1]) + ]) + AS_VAR_POPDEF([prog]) +]) + +# Macro to validate that $1 is a directory (or a symlink to one). If not, exit +# with error, prefixed with $2. +AC_DEFUN([CH_REQUIRE_DIR], [ + AC_MSG_CHECKING([whether $1 is a directory]) + AS_IF([test -d "$1"], + [AC_MSG_RESULT(yes)], + [AC_MSG_RESULT(no) + AC_MSG_ERROR([$2: not a directory: $1])]) +]) + + ### Options ################################################################## # Note: Variables must match option, e.g. --disable-foo-bar => enable_foo_bar. 
@@ -139,6 +207,36 @@ AS_CASE([$with_seccomp], [*], # anything else [AC_MSG_ERROR([invalid --with-seccomp arg: $with_seccomp])]) +AC_ARG_WITH([json], + AS_HELP_STRING([--with-json=@<:@yes|no@:>@], + [enable JSON features by linking with libcjson])) +AS_CASE([$with_json], + [yes], # --with-json=yes or --with-json + [want_json=yes + need_json=yes], + [no], # --with-json=no or --without-json + [want_json=no + need_json=no], + [''], # neither --with-json nor --without-json specified + [want_json=yes + need_json=no], + [*], # unknown argument + [AC_MSG_ERROR([--with-json: bad argument: $with_json])]) + +AC_ARG_WITH([json-include], + AS_HELP_STRING([--with-json-include=DIR], + [directory containing cJSON.h (if not in defaults)])) +AS_IF([test -n "$with_json_include"], + [inc_json=$with_json_include + CH_REQUIRE_DIR([$inc_json], [--with-json-include])]) + +AC_ARG_WITH([json-lib], + AS_HELP_STRING([--with-json-lib=DIR], + [directory containing libcjson.so (if not in defaults)])) +AS_IF([test -n "$with_json_lib"], + [lib_json=$with_json_lib + CH_REQUIRE_DIR([$lib_json], [--with-json-lib])]) + AC_ARG_WITH([libsquashfuse], AS_HELP_STRING([--with-libsquashfuse=@<:@yes|no|PATH@:>@], [whether to link with libsquashfuse])) @@ -174,78 +272,22 @@ AC_ARG_WITH([sphinx-python], [sphinx_python='']) -### Feature test macros ###################################################### - -# Macro to validate executable versions. Arguments: -# -# $1 name of variable containing executable name or absolute path -# $2 minimum version -# $3 append to $1 to make shell pipeline to get actual version only -# (e.g., without program name) -# -# This macro is not able to determine if a program exists, only whether its -# version is sufficient. ${!1} (i.e, the value of the variable whose name is -# stored in $1) must be either empty, an absolute path to an executable, or -# the name of a program in $PATH. A prior macro such as AX_WITH_PROG can be -# used to ensure this condition. 
-# -# If ${!1} is an absolute path, and that file isn’t executable, error out. If -# it’s something other than an absolute path, assume it’s the name of a -# program in $PATH; if not, the behavior is undefined but not good (FIXME). -# -# Post-conditions: -# -# 1. If ${!1} is non-empty and the version reported by the program is -# greater than or equal to the minimum, ${!1} is unchanged. If ${!1} is -# empty or reported version is insufficient, ${!1} is the empty string. -# This lets you test version sufficiency by whether ${!1} is empty. -# -# 2. $1_VERSION_NOTE contains a brief explanatory note. -# -AC_DEFUN([CH_CHECK_VERSION], [ - AS_VAR_PUSHDEF([prog], [$1]) - AS_IF([test -n "$prog"], [ - # ${!1} is non-empty - AS_CASE([$prog], - # absolute path; check if executable - [/*], [AC_MSG_CHECKING([if $prog is executable]) - AS_IF([test -e "$prog"], - [AC_MSG_RESULT([ok])], - [AC_MSG_RESULT([no]) - AC_MSG_ERROR([must be executable])])]) - AC_MSG_CHECKING([if $prog version >= $2]) - vact=$($prog $3) - AX_COMPARE_VERSION([$2], [le], [$vact], [ - AC_SUBST([$1_VERSION_NOTE], ["ok ($vact)"]) - AC_MSG_RESULT([ok ($vact)]) - ], [ - AC_SUBST([$1_VERSION_NOTE], ["too old ($vact)"]) - AC_MSG_RESULT([too old ($vact)]) - AS_UNSET([$1]) - ]) - ], [ - # ${!} is empty - AC_SUBST([$1_VERSION_NOTE], ["not found"]) - AS_UNSET([$1]) - ]) - AS_VAR_POPDEF([prog]) -]) - - ### C compiler ############################################################### # Need a C99 compiler. (See https://stackoverflow.com/a/28558338.) AC_PROG_CC -# Set up CFLAGS. -ch_cflags='-std=c99 -Wall' -AS_IF([test -n "$lib_libsquashfuse"], - [ch_cflags="$ch_cflags -I$inc_libsquashfuse -L$lib_libsquashfuse" - # Without this, clang fails with “error: argument unused during - # compilation” on the -L. GCC ignores it. - ch_cflags="$ch_cflags -Wno-unused-command-line-argument"]) +# Set up CFLAGS. -Wno-unused-command-line-argument is for clang, which fails +# with an error if -L is present for non-linking stages. 
It seemed easier to +# add it unconditionally rather than maintain conditionals about which +# compiler and which libraries. +ch_cflags='-std=c99 -Wall -Wno-unused-command-line-argument' AS_IF([test $use_werror = yes], [ch_cflags="$ch_cflags -Werror"]) +AS_IF([test -n "$inc_json"], # -L$lib_json added below + [ch_cflags="$ch_cflags -I$inc_json"]) +AS_IF([test -n "$lib_libsquashfuse"], + [ch_cflags="$ch_cflags -I$inc_libsquashfuse -L$lib_libsquashfuse"]) AX_CHECK_COMPILE_FLAG([$ch_cflags], [ CFLAGS="$CFLAGS $ch_cflags" @@ -339,6 +381,9 @@ AC_RUN_IFELSE([AC_LANG_SOURCE([[ [AC_MSG_ERROR([cross-compilation not supported])]) AC_MSG_RESULT($have_userns) + +### ch-run optional ########################################################## + # overlayfs AC_DEFUN([CH_OVERLAY_C], [[ #define _GNU_SOURCE @@ -421,9 +466,6 @@ AS_IF([test $enable_impolite_checks = yes], [AC_MSG_ERROR([cross-compilation not supported])])]) AC_MSG_RESULT($have_tmpfs_xattrs) - -### ch-run optional ########################################################## - # FNM_EXTMATCH is a GNU extension to support extended globs in fnmatch(3). AC_CHECK_DECL(FNM_EXTMATCH, [have_fnm_extmatch=yes], @@ -431,6 +473,37 @@ AC_CHECK_DECL(FNM_EXTMATCH, [[#define _GNU_SOURCE #include ]]) +# cJSON. Note that we don’t try to ensure the header we find matches the +# library we find. Hopefully that’s not a problem. +AS_IF([test $want_json = yes], [ + AC_CHECK_LIB(cjson, cJSON_ParseWithLength, + [have_libcjson=yes, + AS_IF([test -n "$lib_json"], + [CH_RUN_LIBS="-Wl,-rpath=$lib_json $CH_RUN_LIBS"]) + CH_RUN_LIBS="-lcjson $CH_RUN_LIBS"], + [have_libcjson=no], + [$CH_RUN_LIBS]) + # The include file installs by default to “$PREFIX/include/cjson/cJSON.h”, + # but --with-json-include shouldn’t require a “cjson” subdirectory and it + # seemed impossible to document that concisely anyway. Therefore, try both + # and define a macro. Double quotes support bundling it with Charliecloud. 
+ AC_CHECK_HEADER([cJSON.h], + [have_cjson_h=yes + CJSON_H='"cJSON.h"'], + [AC_CHECK_HEADER([cjson/cJSON.h], + [have_cjson_h=yes + CJSON_H='"cjson/cJSON.h"'], + [CJSON_H='not found' + have_cjson_h=no])]) +], [have_libcjson=no + have_cjson_h=no]) +# Error out if needed but not found. +AS_IF([test $have_libcjson = yes && test $have_cjson_h = yes], + [have_json=yes], + [have_json=no]) +AS_IF([test $need_json = yes && test $have_json = no], + [AC_MSG_ERROR([--with-json=yes but cJSON.h not found])]) + # Should we build seccomp? AC_MSG_CHECKING([for seccomp filter support]) AC_RUN_IFELSE([AC_LANG_SOURCE([[ @@ -784,6 +857,7 @@ AS_IF([test $enable_syslog = yes], AM_CONDITIONAL([ENABLE_TEST], [test $enable_test = yes]) AC_SUBST([CH_RUN_LIBS]) +AC_SUBST([CJSON_H]) AC_SUBST([PYTHON_SHEBANG]) AC_SUBST([SPHINX]) @@ -811,6 +885,14 @@ AS_IF([ test $have_userns = yes], [have_ch_run=yes], [have_ch_run=no]) +AS_IF([ test $want_json = yes], + [libcjson_note=$have_libcjson + AS_IF([test $have_cjson_h = yes], + [cjson_h_note="yes, $CJSON_H"], + [cjson_h_note=no])], + [libcjson_note='not tested' + cjson_h_note='not tested']) + # image builders AS_IF([ test $enable_ch_image = yes \ @@ -946,11 +1028,18 @@ Building Charliecloud test suite ... ${enable_test} required: - C99 compiler ... ${CC} ${CFLAGS} + C99 compiler ... ${CC} + \$CFLAGS ... ${CFLAGS} + ch-run(1) library args ... ${CH_RUN_LIBS} optional: extended glob patterns in --unset-env ... ${have_fnm_extmatch} + JSON features: ${have_json} + enabled ... ${want_json} + libcjson ... ${libcjson_note} + cJSON.h ... ${cjson_h_note} + ch-run(1) internal SquashFS mounting: ${have_libsquashfuse} enabled ... ${want_libsquashfuse} libfuse3 ... 
${have_libfuse3} ${fuse3_CFLAGS:-} From 5c575a07ac5bd3d7b9544c2aab79c9a8aa0c63a4 Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky Date: Mon, 20 May 2024 16:42:14 -0600 Subject: [PATCH 02/29] docs draft [skip ci] --- doc/cdi-nvidia.json | 36 ++++++++ doc/ch-run.rst | 206 +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 241 insertions(+), 1 deletion(-) create mode 100644 doc/cdi-nvidia.json diff --git a/doc/cdi-nvidia.json b/doc/cdi-nvidia.json new file mode 100644 index 000000000..0bc419d09 --- /dev/null +++ b/doc/cdi-nvidia.json @@ -0,0 +1,36 @@ +{ + "cdiVersion": "0.5.0", + "kind": "nvidia.com/gpu", + "devices": [ { + "name": "foo", + "containerEdits": { + "deviceNodes": [ { "path": "/dev/nvidia0" }, + { "path": "/dev/dri/card0" } ], + "hooks": [ { "hookName": "createContainer", + "path": "/usr/bin/nvidia-ctk", + "args": [ "nvidia-ctk", + "hook", "create-symlinks", + "--link", "../card0::/dev/dri/by-path/pci-0000:07:00.0-card" + ] } ] } } ], + "containerEdits": { + "env": [ "NVIDIA_VISIBLE_DEVICES=void" ], + "deviceNodes": [ { "path": "/dev/nvidia-modeset" }, + { "path": "/dev/nvidiactl" } ], + "mounts": [ + { "hostPath": "/run/nvidia-fabricmanager/socket", + "containerPath": "/run/nvidia-fabricmanager/socket", + "options": [ "ro", "nosuid", "nodev", "bind", "noexec" ] }, + { "hostPath": "/usr/bin/nvidia-smi", + "containerPath": "/usr/bin/nvidia-smi", + "options": [ "ro", "nosuid", "nodev", "bind" ] }, + { "hostPath": "/usr/lib/x86_64-linux-gnu/libcuda.so.535.161.08", + "containerPath": "/usr/lib/x86_64-linux-gnu/libcuda.so.535.161.08", + "options": [ "ro", "nosuid", "nodev", "bind" ] } ], + "hooks": [ + { "hookName": "createContainer", + "path": "/usr/bin/nvidia-ctk", + "args": [ + "nvidia-ctk", + "hook", "update-ldcache", + "--folder", "/usr/lib/x86_64-linux-gnu" ] } ] } +} diff --git a/doc/ch-run.rst b/doc/ch-run.rst index 2771078e4..2bb7689a5 100644 --- a/doc/ch-run.rst +++ b/doc/ch-run.rst @@ -56,6 +56,25 @@ mounting SquashFS images with FUSE. 
:code:`-c`, :code:`--cd=DIR` Initial working directory in container. + :code:`--cdi-dirs=DIRS` + Colon-separated list of directories to search for CDI JSON specifications. + Default: :code:`CH_RUN_CDI_DIRS` if set, otherwise + :code:`/etc/cdi:/var/run/cdi`. + + :code:`-d`, :code:`--devices` + Inject default CDI devices into the container. The default devices are + those listed in :code:`CH_RUN_CDI_DEFAULT` if set, otherwise all devices + for which a specification is found. Implies :code:`--write-fake`. + + :code:`--device=DEV[,DEV]` + Inject CDI device(s) identified by comma-separated :code:`DEV`. These are + either (1) a filename, if :code:`DEV` starts with a slash (:code:`/`) or + dot (:code:`.`), e.g. :code:`/etc/cdi/nvidia.json`, or (2) a CDI selector + for a list of devices in a CDI specification file, e.g. + :code:`nvidia.com/gpu`. Specific devices may not be selected, e.g. + :code:`nvidia.com/gpu=1:0` is invalid (see below for why). Implies + :code:`--write-fake`. Can be repeated. + :code:`--env-no-expand` Don’t expand variables when using :code:`--set-env`. @@ -345,6 +364,7 @@ Caveats: * Many of the arguments given to the race losers, such as the image path and :code:`--bind`, will be ignored in favor of what was given to the winner. + .. _ch-run_overlay: Writeable overlay with :code:`--write-fake` @@ -375,6 +395,188 @@ requires kernel support. Specifically: and thus is not helpful for unprivileged containers.) +Injecting host “devices” with Container Device Interface (CDI) +============================================================== + +Overview of CDI +--------------- + +`Container Device Interface (CDI) +`_ +is an emerging `Cloud Native Computing Foundation (CNCF) +`_ standard to specify how “devices” are made available +to containers. Importantly, a CDI *device* is not a hardware gadget nor a +device file but rather a set of container modifications to be done before +invoking the user command. 
It’s intended to make devices (in the usual sense +of hardware gadgets) available inside containers but is quite flexible. A CDI +device can specify multiple device files, environment variables, mounts, and +more. Christopher Desiniotis gave a good talk at Container Plumbing Days 2024 +introducing CDI (`slides +`_, +`video `_). + +CDI devices are described in JSON *specification files*, which are declarative +except they provide for arbitrary hook programs. However, Charliecloud treats +them as fully declarative by interpreting hooks as a declarative statement +rather than a program to be run (brittle, but works for now). This +declarativeness has a significant advantage over OCI hooks, because we have a +clear description of what needs to be done rather than needing to run opaque +programs as hooks. + +Another advantage of CDI is that it’s largely orthogonal to OCI. While the +specifications have a strong OCI framing, this is largely an artifact of the +exposition style rather than a core notion. + +Here is an example spec file: + +.. literalinclude:: cdi-nvidia.json + :language: JSON + +This declares: + +#. A single CDI device called :code:`nvidia.com/gpu=foo`, comprising: + + #. Two device files to be made available in the container, + :code:`/dev/nvidia0` and :code:`/dev/dri/card0`. + + #. One symlink to create inside the container, + :code:`/dev/dri/by-path/pci-0000:07:00.0-card` → :code:`../card0`. + +#. A set of container changes to be made once regardless of which devices are + selected (this example has one device, but real spec files have several), + comprising: + + #. One environment variable to set, :code:`NVIDIA_VISIBLE_DEVICES`. + + #. Two device files to be made available in the container, + :code:`/dev/nvidia-modeset` and :code:`/dev/nvidiactl`. + + #. A socket (:code:`/run/nvidia-fabricmanager/socket`), executable + (:code:`nvidia-smi`), and shared library + (:code:`libcuda.so.535.161.08`) to be bind-mounted into the + container. + + #. 
Run the *host* :code:`ldconfig` to update the *container* linker cache, + scanning only container directory :code:`/usr/lib/x86_64-linux-gnu`. + +Charliecloud’s CDI implementation +--------------------------------- + +Charliecloud has some differences from other container implementations in how +this spec file is interpreted, but the results (working CDI devices) should be +the same. These are: + +#. All CDI devices available to the user normally are also available in the + container. For example, some implementations allow + :code:`--device=nvidia.com/gpu=foo`, which puts only the GPU named + :code:`foo` in the container, but :code:`ch-run` accepts only + :code:`--device=nvidia.com/gpu` (and similarly in + :code:`CH_RUN_CDI_DEFAULT`). This is because the host :code:`/dev` is + bind-mounted into Charliecloud containers, so there is no need to deal with + individual device files. + +#. Hooks are interpreted declaratively rather than running the specified + program. This is because we have not yet encountered any hooks that are + both useful under Charliecloud and do a task that merits an external + program. See below for details on individual hooks. + +#. Only bind mounts are implemented, because unprivileged mount namespaces + can’t mount much that is meaningful, and we haven’t seen any other mount + types yet. + +#. Charliecloud minimizes the number of bind mounts to avoid bloating the + container filesystem tree. (The spec file for one of our not-that-large + systems declares 47 mounts!) We do this by bind-mounting each filesystem + represented in a host path once and then symlinking into it for the + declared bind mounts. + +Command line options and environment variables +---------------------------------------------- + +:code:`ch-run` must do two things to make CDI devices available: (1) locate +appropriate specification files and (2) select which kinds of CDI devices to +inject. 
We assume further that the most common use case is to inject all +available CDI devices. The design of Charliecloud’s CDI user interface follows +from these principles. + +TL;DR: The intended most common usage is simply :code:`ch-run -d` to inject +all available CDI devices, using prior configuration by users or admins. + +Available spec files are those in the colon-separated list of directories in +:code:`--cdi-dirs=DIRS` if given, otherwise in :code:`CH_RUN_CDI_DIRS`, +otherwise :code:`/etc/cdi:/var/run/cdi` as required by the standard. + +The option :code:`--devices` (plural) or :code:`-d` then injects all devices +found in all spec files in these directories. + +Individual CDI device kinds can be selected with :code:`--device=DEV` +(singular), where :code:`DEV` is a device identifier. If the identifier starts +with slash (:code:`/`) or dot (:code:`.`), the identifier is a path to a JSON +CDI spec file, and all devices in that file are injected (e.g., +:code:`--device=./foo.json`). Otherwise, it is a CDI device kind with no +device name(s) (e.g., :code:`--device=nvidia.com/gpu`). The option can be +repeated to inject multiple device kinds. + +Importantly, both :code:`--device` and :code:`--devices` imply +:code:`--write-fake` (:code:`-W`) so the container image can be written. + +Hooks +------ + +Behavior summary +~~~~~~~~~~~~~~~~ + +Presently, CDI hooks fall into three categories for Charliecloud: + +#. **Known hooks that we need**, with behavior emulated internally (i.e., we do + what the hook does, adapted for Charliecloud, rather than running it). + +#. **Known hooks that we don’t need**; we ignore these quietly (i.e., logged but + at a level hidden by default). + +#. **Unknown hooks.** We warn about these, because they need to be either moved + into one of the first two categories or actually run. (That is, we’re still + figuring out what’s needed for Charliecloud here.) + +The next two sections document known hooks. + +.. 
note:: + + `nVidia Container Toolkit + `_ + CDI hooks can be spelled either `either + `_ + :code:`nvidia-ctk hook` (two words) or :code:`nvidia-ctk-hook` (one word). + We treat the two spellings the same. + +Emulated hooks +~~~~~~~~~~~~~~ + +#. :code:`nvidia-ctk-hook update-ldcache` . This updates the container’s + linker cache (i.e., :code:`/etc/ld.so.cache`), `notably using + `_ + the *host’s* :code:`ldconfig`. For now at least, we instead use the + *container’s* :code:`ldconfig`, the reasoning being that (1) the + container’s linker updating its own cache is lower-risk compatibility wise + and (2) it seems unlikely that an image would be compatible with nVidia + libraries and have a linker cache but no :code:`ldconfig` executable. + + If the image has no :code:`ldconfig`, :code:`ch-run` exits with an error + and the container does not run. This indicates the assumption above is + false, so please report this error as a bug. + +Ignored hooks +~~~~~~~~~~~~~ + +#. :code:`nvidia-ctk-hook create-symlinks`. This creates one or more symlinks. + In our experience, the links created already exist in the host’s + :code:`/dev` or are created by :code:`ldconfig(8)`. + +#. :code:`nvidia-ctk-hook chmod`. This changes file permissions, but in + unprivileged Charliecloud containers, the invoking user will already have + access to all appropriate files. + + Environment variables ===================== @@ -760,4 +962,6 @@ status is 1 regardless of the signal value. .. include:: ./see_also.rst .. LocalWords: mtune NEWROOT hugetlbfs UsrMerge fusermount mybox IMG HOSTPATH -.. LocalWords: noprofile norc SHLVL PWD kernelnewbies extglob +.. LocalWords: noprofile norc SHLVL PWD kernelnewbies extglob cdi AMMVs dri +.. LocalWords: Desiniotis declarativeness fabricmanager libglxserver ctk +.. 
LocalWords: libcuda ldcache From 862fcfa53417d068f9814dcca92b75c5938b3a42 Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky Date: Tue, 21 May 2024 17:14:33 -0600 Subject: [PATCH 03/29] more build stuff [skip ci] --- bin/Makefile.am | 3 +++ bin/ch-run.c | 6 ++++++ bin/ch_json.c | 10 ++++++++++ bin/ch_json.h | 0 configure.ac | 30 ++++++++++++++++-------------- 5 files changed, 35 insertions(+), 14 deletions(-) create mode 100644 bin/ch_json.c create mode 100644 bin/ch_json.h diff --git a/bin/Makefile.am b/bin/Makefile.am index 0b2b9a77e..72fad95d0 100644 --- a/bin/Makefile.am +++ b/bin/Makefile.am @@ -9,6 +9,9 @@ bin_PROGRAMS = ch-checkns ch-run ch_checkns_SOURCES = ch-checkns.c ch_misc.h ch_misc.c ch_run_SOURCES = ch-run.c ch_core.h ch_core.c ch_misc.h ch_misc.c +if HAVE_JSON +ch_run_SOURCES += ch_json.h ch_json.c +endif if HAVE_LIBSQUASHFUSE ch_run_SOURCES += ch_fuse.h ch_fuse.c endif diff --git a/bin/ch-run.c b/bin/ch-run.c index 774f02ed9..01c06e6e4 100644 --- a/bin/ch-run.c +++ b/bin/ch-run.c @@ -52,6 +52,12 @@ const struct argp_option options[] = { { "bind", 'b', "SRC[:DST]", 0, "mount SRC at guest DST (default: same as SRC)"}, { "cd", 'c', "DIR", 0, "initial working directory in container"}, +#ifdef HAVE_JSON + { "cdi-dirs", -19, "DIRS", 0, "director(y|ies) containing CDI specs" }, + { "device", -18, "DEV[,DEV]", 0, + "inject CDI device DEV (can be repeated)" }, + { "devices", 'd', 0, 0, "inject default CDI devices" }, +#endif { "env-no-expand", -10, 0, 0, "don't expand $ in --set-env input"}, { "feature", -11, "FEAT", 0, "exit successfully if FEAT is enabled" }, { "gid", 'g', "GID", 0, "run as GID within container" }, diff --git a/bin/ch_json.c b/bin/ch_json.c new file mode 100644 index 000000000..304152c90 --- /dev/null +++ b/bin/ch_json.c @@ -0,0 +1,10 @@ +/* Copyright © Triad National Security, LLC, and others. 
*/ + +#define _GNU_SOURCE + +#include "config.h" + +#include CJSON_H + +#include "ch_json.h" +#include "ch_misc.h" diff --git a/bin/ch_json.h b/bin/ch_json.h new file mode 100644 index 000000000..e69de29bb diff --git a/configure.ac b/configure.ac index 1bc4ca95e..f15cc6dfd 100644 --- a/configure.ac +++ b/configure.ac @@ -477,7 +477,7 @@ AC_CHECK_DECL(FNM_EXTMATCH, # library we find. Hopefully that’s not a problem. AS_IF([test $want_json = yes], [ AC_CHECK_LIB(cjson, cJSON_ParseWithLength, - [have_libcjson=yes, + [have_libcjson=yes AS_IF([test -n "$lib_json"], [CH_RUN_LIBS="-Wl,-rpath=$lib_json $CH_RUN_LIBS"]) CH_RUN_LIBS="-lcjson $CH_RUN_LIBS"], @@ -489,11 +489,11 @@ AS_IF([test $want_json = yes], [ # and define a macro. Double quotes support bundling it with Charliecloud. AC_CHECK_HEADER([cJSON.h], [have_cjson_h=yes - CJSON_H='"cJSON.h"'], + cjson_h='"cJSON.h"'], [AC_CHECK_HEADER([cjson/cJSON.h], [have_cjson_h=yes - CJSON_H='"cjson/cJSON.h"'], - [CJSON_H='not found' + cjson_h='"cjson/cJSON.h"'], + [cjson_h='not found' have_cjson_h=no])]) ], [have_libcjson=no have_cjson_h=no]) @@ -502,7 +502,7 @@ AS_IF([test $have_libcjson = yes && test $have_cjson_h = yes], [have_json=yes], [have_json=no]) AS_IF([test $need_json = yes && test $have_json = no], - [AC_MSG_ERROR([--with-json=yes but cJSON.h not found])]) + [AC_MSG_ERROR([--with-json=yes but cJSON not found])]) # Should we build seccomp? AC_MSG_CHECKING([for seccomp filter support]) @@ -837,14 +837,13 @@ CH_CHECK_VERSION([WGET], [$vmin_wget], [--version | head -1 | cut -d' ' -f3]) # the output Makefile. It *does not* create a Make variable. # # 4. AC_DEFINE(foo, value, comment) #define’s the preprocessor symbol foo to -# value in config.h. (Supposedly value and comment are optional but I got -# warnings doing that.) So this is how you make configure values -# available in C code (as macros, not variables). 
Typically you would -# define something or not (allowing #ifdef), rather than always define to -# true or false (which would require #if). +# value in config.h. (Supposedly, value and comment are optional but I +# got warnings doing that.) Importantly, value is not expanded. This is +# good for either defining or not defining a C macro; you can then use +# #ifdef to gate on that macro. # -# 5. AC_DEFINE_UNQUOTES adds some extra transformations to the above. I -# didn’t quite follow. +# 5. AC_DEFINE_UNQUOTED also expands value. This is good for defining a C +# macro to the actual value of some configure variable. # # Below are all the variables we want available outside configure. @@ -857,7 +856,6 @@ AS_IF([test $enable_syslog = yes], AM_CONDITIONAL([ENABLE_TEST], [test $enable_test = yes]) AC_SUBST([CH_RUN_LIBS]) -AC_SUBST([CJSON_H]) AC_SUBST([PYTHON_SHEBANG]) AC_SUBST([SPHINX]) @@ -869,6 +867,10 @@ AS_IF([test $have_fnm_extmatch = yes], [AC_DEFINE([HAVE_FNM_EXTMATCH], [1], [extended globs supported])]) AS_IF([test $have_seccomp = yes], [AC_DEFINE([HAVE_SECCOMP], [1], [seccomp supported])]) +AM_CONDITIONAL([HAVE_JSON], [test $have_json = yes]) +AS_IF([test $have_json = yes], + [AC_DEFINE([HAVE_JSON], [1], [enable JSON features]) + AC_DEFINE_UNQUOTED([CJSON_H], [$cjson_h], [cJSON.h location])]) AM_CONDITIONAL([HAVE_LIBSQUASHFUSE], [test $have_libsquashfuse = yes]) AS_IF([test $have_libsquashfuse = yes], [AC_DEFINE([HAVE_LIBSQUASHFUSE], [1], [link with libsquashfuse])]) @@ -888,7 +890,7 @@ AS_IF([ test $have_userns = yes], AS_IF([ test $want_json = yes], [libcjson_note=$have_libcjson AS_IF([test $have_cjson_h = yes], - [cjson_h_note="yes, $CJSON_H"], + [cjson_h_note="yes, $cjson_h"], [cjson_h_note=no])], [libcjson_note='not tested' cjson_h_note='not tested']) From 05b631736b183f13d918e8cfe378e2d7a95890d1 Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky Date: Fri, 31 May 2024 16:55:40 -0600 Subject: [PATCH 04/29] record --device arguments [skip ci] --- bin/ch-run.c | 
38 ++++++++++++++++++++++++++++++++++---- bin/ch_core.h | 3 +++ bin/ch_fuse.h | 1 + bin/ch_json.c | 35 +++++++++++++++++++++++++++++++++++ bin/ch_json.h | 17 +++++++++++++++++ bin/ch_misc.h | 2 ++ 6 files changed, 92 insertions(+), 4 deletions(-) diff --git a/bin/ch-run.c b/bin/ch-run.c index 01c06e6e4..dc18d8528 100644 --- a/bin/ch-run.c +++ b/bin/ch-run.c @@ -14,6 +14,9 @@ #include "config.h" #include "ch_core.h" +#ifdef HAVE_JSON +#include "ch_json.h" +#endif #include "ch_misc.h" @@ -54,8 +57,7 @@ const struct argp_option options[] = { { "cd", 'c', "DIR", 0, "initial working directory in container"}, #ifdef HAVE_JSON { "cdi-dirs", -19, "DIRS", 0, "director(y|ies) containing CDI specs" }, - { "device", -18, "DEV[,DEV]", 0, - "inject CDI device DEV (can be repeated)" }, + { "device", -18, "DEV", 0, "inject CDI device(s) DEV (repeatable)" }, { "devices", 'd', 0, 0, "inject default CDI devices" }, #endif { "env-no-expand", -10, 0, 0, "don't expand $ in --set-env input"}, @@ -98,6 +100,9 @@ const struct argp_option options[] = { struct args { struct container c; struct env_delta *env_deltas; +#ifdef HAVE_JSON + char ** cdi_devids; +#endif char *initial_dir; #ifdef HAVE_SECCOMP bool seccomp_p; @@ -119,7 +124,7 @@ static error_t parse_opt(int key, char *arg, struct argp_state *state); void parse_set_env(struct args *args, char *arg, int delim); void privs_verify_invoking(); char *storage_default(void); -extern void warnings_reprint(void); +void write_fake_enable(struct args *args, char *overlay_size); /** Global variables **/ @@ -173,6 +178,7 @@ int main(int argc, char *argv[]) .private_tmp = false, .type = IMG_NONE, .writable = false }, + .cdi_devids = list_new(sizeof(char *), 0), .env_deltas = list_new(sizeof(struct env_delta), 0), .initial_dir = NULL, #ifdef HAVE_SECCOMP @@ -252,12 +258,14 @@ int main(int argc, char *argv[]) #endif VERBOSE("unsafe: %d", args.unsafe); + cdi_update(&args.c, args.cdi_devids); containerize(&args.c); fix_environment(&args); #ifdef 
HAVE_SECCOMP if (args.seccomp_p) seccomp_install(); #endif + // run_command(ldconfig, true, NULL); // FIXME run_user_command(c_argv, args.initial_dir); // should never return exit(EXIT_FAILURE); } @@ -512,6 +520,13 @@ static error_t parse_opt(int key, char *arg, struct argp_state *state) else FATAL("invalid --test argument: %s; see source code", arg); break; +#ifdef HAVE_JSON + case -18: // --device + Te (strlen(arg) > 0, "--device: DEV must be longer than zero"); + write_fake_enable(args, NULL); + list_append((void **)&(args->cdi_devids), &arg, sizeof(arg)); + break; +#endif case 'b': { // --bind char *src, *dst; for (i = 0; args->c.binds[i].src != NULL; i++) // count existing binds @@ -579,7 +594,7 @@ static error_t parse_opt(int key, char *arg, struct argp_state *state) args->c.writable = true; break; case 'W': // --write-fake - args->c.overlay_size = arg != NULL ? arg : WRITE_FAKE_DEFAULT; + write_fake_enable(args, arg); break; case ARGP_KEY_NO_ARGS: argp_state_help(state, stderr, ( ARGP_HELP_SHORT_USAGE @@ -652,3 +667,18 @@ char *storage_default(void) return storage; } + +/* Enable the overlay if not already enabled. */ +void write_fake_enable(struct args *args, char *overlay_size) +{ + if (overlay_size != NULL) { + // new overlay size specified: use it regardless of previous enablement + args->c.overlay_size = overlay_size; + } else if (args->c.overlay_size == NULL) { + // no new size, not yet enabled: enable with default size + args->c.overlay_size = WRITE_FAKE_DEFAULT; + } else { + // no new size, already enabled: keep existing size, nothing to do + T_ (args->c.overlay_size != NULL); + } +} diff --git a/bin/ch_core.h b/bin/ch_core.h index f65cfc083..d6ba1fb75 100644 --- a/bin/ch_core.h +++ b/bin/ch_core.h @@ -3,7 +3,10 @@ This interface contains Charliecloud's core containerization features. 
*/ #define _GNU_SOURCE +#pragma once + #include +#include /** Types **/ diff --git a/bin/ch_fuse.h b/bin/ch_fuse.h index 5250ed85a..bc756c54d 100644 --- a/bin/ch_fuse.h +++ b/bin/ch_fuse.h @@ -1,6 +1,7 @@ /* Copyright © Triad National Security, LLC, and others. */ #define _GNU_SOURCE +#pragma once /** Function prototypes **/ diff --git a/bin/ch_json.c b/bin/ch_json.c index 304152c90..3e645af0b 100644 --- a/bin/ch_json.c +++ b/bin/ch_json.c @@ -8,3 +8,38 @@ #include "ch_json.h" #include "ch_misc.h" + + +/** Macros **/ + + +/** Constants **/ + + +/** Global variables **/ + + +/** Function prototypes (private) **/ + + +/** Functions **/ + +/* Update container configuration c according to CDI arguments given. Note + that here we just tidy up the configuration. Actually doing things (e.g. + bind mounts) happens later. */ +void cdi_update(struct container *c, char **devids) +{ + // read CDI spec files in configured directories + + // read CDI spec files specifically requested + + // filter device kinds to those requested + + // figure out bind mounts actually needed and set up symlinks + + // set ldconfig bit + + for (size_t i = 0; devids[i] != NULL; i++) { + VERBOSE("CDI device request %d: %s", i, devids[i]); + } +} diff --git a/bin/ch_json.h b/bin/ch_json.h index e69de29bb..187201cdc 100644 --- a/bin/ch_json.h +++ b/bin/ch_json.h @@ -0,0 +1,17 @@ +/* Copyright © Triad National Security, LLC, and others. + + This interface contains all functions that deal with JSON: OCI, CDI, and + friends. */ + +#define _GNU_SOURCE +#pragma once + +#include "config.h" +#include "ch_core.h" + +#include CJSON_H + + +/** Function prototypes **/ + +void cdi_update(struct container *c, char ** devids); diff --git a/bin/ch_misc.h b/bin/ch_misc.h index f590a0890..0144b792c 100644 --- a/bin/ch_misc.h +++ b/bin/ch_misc.h @@ -5,6 +5,8 @@ libraries that ch_core requires. 
*/ #define _GNU_SOURCE +#pragma once + #include #include #include From 64766f887a56679277b4947fd6661fc14a95179f Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky Date: Fri, 7 Jun 2024 17:52:04 -0600 Subject: [PATCH 05/29] snapshot [skip ci] --- bin/ch_json.c | 149 ++++++++++++++++++++++++++++++++++++++++++++- bin/ch_json.h | 11 ++++ bin/ch_misc.c | 91 ++++++++++++++++++++------- misc/gdb-backtrace | 8 +++ 4 files changed, 234 insertions(+), 25 deletions(-) create mode 100755 misc/gdb-backtrace diff --git a/bin/ch_json.c b/bin/ch_json.c index 3e645af0b..f4ec964b4 100644 --- a/bin/ch_json.c +++ b/bin/ch_json.c @@ -1,6 +1,9 @@ /* Copyright © Triad National Security, LLC, and others. */ #define _GNU_SOURCE +#include +#include +#include #include "config.h" @@ -13,25 +16,132 @@ /** Macros **/ +/** Types **/ + +struct json_dispatch { + char *name; + struct json_dispatch *children; + void (*f)(cJSON *tree, void *state); +}; +#define JDF void (*)(cJSON *, void *) /* to cast callbacks in dispatch tables */ + + /** Constants **/ +// Block size in bytes for reading JSON files. +const size_t READ_SZ = 16384; + + +/** Function prototypes (private) **/ + +void cdi_add(struct cdi_spec ***specs, struct cdi_spec *spec_new); +struct cdi_spec *cdi_read(const char *path); +void visit(struct json_dispatch actions[], cJSON *tree, void *state); +void visit_dispatch(struct json_dispatch action, cJSON *tree, void *state); + +// parser callbacks +void cdiPC_kind(cJSON *tree, struct cdi_spec *spec); + /** Global variables **/ +/* Callback tables. In the struct, the callback’s second argument is “void *” + so any state object can be provided. However, we’d prefer the actual + functions to take the correct pointer type; thus, they need to be cast. + Alternatives include: -/** Function prototypes (private) **/ + 1. Cast every use of the variable in the callbacks. This seemed verbose + and error-prone. + + 2. Add a local variable of the correct type to each callback. 
I thought + distributed boilerplate like this seemed worse. */ +struct json_dispatch cdiPD_root[] = { + { "kind", NULL, (JDF)cdiPC_kind }, + { } +}; /** Functions **/ +/* Add spec to the given list of CDI specs, which is an out parameter. If + we’ve seen the spec’s kind before, replace the existing spec with the same + kind. Otherwise, append the new spec. */ +void cdi_add(struct cdi_spec ***specs, struct cdi_spec *spec_new) +{ + if (*specs != NULL) + for (size_t i = 0; (*specs)[i] != NULL; i++) + if (!strcmp((*specs)[i]->kind, spec_new->kind)) { + DEBUG("CDI: spec %s: replacing at %d", spec_new->kind, i); + free((*specs)[i]); + *specs[i] = spec_new; + return; + } + // don’t alread have the kind if we got through the loop + DEBUG("CDI: spec %s: new", spec_new->kind); + list_append((void **)specs, spec_new, sizeof(spec_new)); +} + +/* Read and parse the CDI spec file at path. Return a pointer to the parsed + struct, which the caller is responsible for freeing. If something goes + wrong, exit with error. */ +struct cdi_spec *cdi_read(const char *path) +{ + FILE *fp; + char *text = NULL; + const char *parse_end; + cJSON *tree; + struct cdi_spec *spec = NULL; + + // Read file into string. Allocate incrementally rather than seeking so + // non-seekable input works. + Tf (fp = fopen(path, "rb"), "CDI: can't open: %s", path); + for (size_t used = 0, avail = READ_SZ; true; avail += READ_SZ) { + T_ (text = realloc(text, avail)); + size_t read_ct = fread(text + used, 1, READ_SZ, fp); + used += read_ct; + if (read_ct < READ_SZ) { + if (feof(fp)) { // EOF reached + text[used] = '\0'; // ensure string ended + break; + } + Tf(0, "CDI: can't read: %s", path); + } + } + + // Parse JSON. + tree = cJSON_ParseWithOpts(text, &parse_end, false); + Tf(tree != NULL, "CDI: JSON failed at byte %d: %s", parse_end - text, path); + + // Visit parse tree to build our struct. 
+ T_ (spec = malloc(sizeof(struct cdi_spec))); + visit(cdiPD_root, tree, spec); + + Tf (false, "haha you %s", "suck"); + + // Clean up. + VERBOSE("CDI: spec read OK: %s: %s", spec->kind, path); + free(text); + cJSON_Delete(tree); + return spec; +} + /* Update container configuration c according to CDI arguments given. Note that here we just tidy up the configuration. Actually doing things (e.g. bind mounts) happens later. */ void cdi_update(struct container *c, char **devids) { + struct cdi_spec **specs = NULL; + // read CDI spec files in configured directories // read CDI spec files specifically requested + for (size_t i = 0; devids[i] != NULL; i++) + if (devids[i][0] == '.' || devids[i][0] == '/') { + cdi_add(&specs, cdi_read(devids[i])); + // FIXME: add kind to requested list + } + + // debugging: print parsed CDI specs // filter device kinds to those requested @@ -39,7 +149,40 @@ void cdi_update(struct container *c, char **devids) // set ldconfig bit - for (size_t i = 0; devids[i] != NULL; i++) { - VERBOSE("CDI device request %d: %s", i, devids[i]); + // clean up + //for (size_t i = 0; specs[i] != NULL; i++) + // cdi_free(specs[i]); + free(specs); +} + +void cdiPC_kind(cJSON *tree, struct cdi_spec *spec) +{ + T_ (spec->kind = strdup(tree->valuestring)); +} + +/* Visit each node in the parse tree in depth-first order. At each node, if + there is a matching callback in actions, call it. For arrays, call the + callback once per array element. */ +void visit(struct json_dispatch actions[], cJSON *tree, void *state) +{ + for (int i = 0; actions[i].name != NULL; i++) { + cJSON *subtree = cJSON_GetObjectItem(tree, actions[i].name); + if (cJSON_IsArray(subtree)) { + cJSON *elem; + cJSON_ArrayForEach(elem, subtree) + visit_dispatch(actions[i], elem, state); + } else { + visit_dispatch(actions[i], subtree, state); + } } } + +/* Call the appropriate callback for the root node of tree, if any. Then
*/ +void visit_dispatch(struct json_dispatch action, cJSON *tree, void *state) +{ + if (action.f != NULL) + action.f(tree, state); + if (action.children != NULL) + visit(action.children, tree, state); +} diff --git a/bin/ch_json.h b/bin/ch_json.h index 187201cdc..34056b339 100644 --- a/bin/ch_json.h +++ b/bin/ch_json.h @@ -8,10 +8,21 @@ #include "config.h" #include "ch_core.h" +#include "ch_misc.h" #include CJSON_H +/** Types **/ + +struct cdi_spec { + char *kind; + bool ldconfig_p; + struct env_var *envs; + struct bind *binds; +}; + + /** Function prototypes **/ void cdi_update(struct container *c, char ** devids); diff --git a/bin/ch_misc.c b/bin/ch_misc.c index bdee7fa20..305cf724e 100644 --- a/bin/ch_misc.c +++ b/bin/ch_misc.c @@ -32,6 +32,31 @@ #define SUPP_GIDS_MAX 128 +/** Constants **/ + +/* Text colors. In principle, we should be using a library for this, e.g. + terminfo(5). However, moderately thorough web searching suggests that + pretty much any modern terminal will support 256-color ANSI codes, and this + is way simpler [1]. Probably should coordinate these colors with the Python + code somehow. + + [1]: https://stackoverflow.com/a/3219471 */ +/* +static const char *COLOUR_CYAN_DARK = "38;5;6m"; +static const char *COLOUR_CYAN_LIGHT = "38;5;14m"; +static const char *COLOUR_RED = "31m"; +static const char *COLOUR_RED_BOLD = "1;31m"; +static const char *COLOUR_RESET = "0m"; +static const char *COLOUR_YELLOW = "33m"; +static const char *_LL_COLOURS[] = { COLOUR_RED_BOLD, // fatal + COLOUR_RED_BOLD, // stderr + COLOUR_RED, // warning + COLOUR_YELLOW, // info + COLOUR_CYAN_LIGHT, // verbose + COLOUR_CYAN_DARK, // debug + COLOUR_CYAN_DARK } // trace +*/ + /** External variables **/ /* Level of chatter on stderr. 
*/ @@ -602,45 +627,67 @@ noreturn void msg_fatal(const char *file, int line, int errno_, void msgv(enum log_level level, const char *file, int line, int errno_, const char *fmt, va_list ap) { - char *message, *ap_msg; - - if (level > verbose) + // note: all components contain appropriate leading/trailing space + // note: be careful about which components need to be freed + char *text_formatted; // caller’s message, formatted + char *level_prefix; // level prefix + char *errno_code; // errno code/number + char *errno_desc; // errno description + char *text_full; // complete text but w/o color codes +// char *colour; // ANSI codes for color +// char *colour_reset; // ANSI codes to reset color + + if (level > verbose) // not verbose enough to log message; do nothing return; - T_ (1 <= asprintf(&message, "%s[%d]: ", - program_invocation_short_name, getpid())); + // Format caller message. + if (fmt == NULL) + text_formatted = "please report this bug"; // users should not see + else + T_ (1 <= vasprintf(&text_formatted, fmt, ap)); - // Prefix for the more urgent levels. + // Prefix some of the levels. switch (level) { case LL_FATAL: - message = cat(message, "error: "); // "fatal" too morbid for users + level_prefix = "error: "; // "fatal" too morbid for users break; case LL_WARNING: - message = cat(message, "warning: "); + level_prefix = "warning: "; break; default: + level_prefix = ""; break; } - // Default message if not specified. Users should not see this. - if (fmt == NULL) - fmt = "please report this bug"; - - T_ (1 <= vasprintf(&ap_msg, fmt, ap)); - if (errno_) { - T_ (1 <= asprintf(&message, "%s%s: %s (%s:%d %d)", message, ap_msg, - strerror(errno_), file, line, errno_)); + // errno. 
+ if (!errno_) { + errno_code = ""; + errno_desc = ""; } else { - T_ (1 <= asprintf(&message, "%s%s (%s:%d)", message, ap_msg, file, line)); + errno_code = cat(" ", strerrorname_np(errno_)); // FIXME: non-portable + T_ (1 <= asprintf(&errno_desc, ": %s", strerror(errno_))); } - if (level == LL_WARNING) { - warnings_offset += string_append(warnings, message, WARNINGS_SIZE, - warnings_offset); - } - fprintf(stderr, "%s\n", message); + // Format and print. + T_ (1 <= asprintf(&text_full, "%s[%d]: %s%s%s (%s:%d%s)", + program_invocation_short_name, getpid(), + level_prefix, text_formatted, errno_desc, + file, line, errno_code)); + fprintf(stderr, "%s\n", text_full); if (fflush(stderr)) abort(); // can't print an error b/c already trying to do that + if (level == LL_WARNING) + warnings_offset += string_append(warnings, text_full, + WARNINGS_SIZE, warnings_offset); + + // Clean up. + free(text_full); + if (errno_) { + free(errno_code); + free(errno_desc); + } + if (fmt != NULL) + free(text_formatted); } /* Return true if the given path exists, false otherwise. On error, exit. If diff --git a/misc/gdb-backtrace b/misc/gdb-backtrace new file mode 100755 index 000000000..f5c5dbc85 --- /dev/null +++ b/misc/gdb-backtrace @@ -0,0 +1,8 @@ +#!/bin/bash + +gdb -batch $1 $2 \ + -ex 'set style enabled on' \ + -ex 'set print pretty on' \ + -ex 'set print frame-info source-and-location' \ + -ex 'echo \n\n' \ + -ex 'backtrace -full' From d3aa024cdf371637ee950ebfd8f4378d9a821859 Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky Date: Tue, 11 Jun 2024 16:57:17 -0600 Subject: [PATCH 06/29] log in colo(u)r [skip ci] --- bin/ch_misc.c | 76 +++++++++++++++++++++++++++++++-------------------- bin/ch_misc.h | 1 + 2 files changed, 47 insertions(+), 30 deletions(-) diff --git a/bin/ch_misc.c b/bin/ch_misc.c index 305cf724e..697cd8c10 100644 --- a/bin/ch_misc.c +++ b/bin/ch_misc.c @@ -41,27 +41,31 @@ code somehow. 
[1]: https://stackoverflow.com/a/3219471 */ -/* -static const char *COLOUR_CYAN_DARK = "38;5;6m"; -static const char *COLOUR_CYAN_LIGHT = "38;5;14m"; -static const char *COLOUR_RED = "31m"; -static const char *COLOUR_RED_BOLD = "1;31m"; -static const char *COLOUR_RESET = "0m"; -static const char *COLOUR_YELLOW = "33m"; -static const char *_LL_COLOURS[] = { COLOUR_RED_BOLD, // fatal - COLOUR_RED_BOLD, // stderr - COLOUR_RED, // warning - COLOUR_YELLOW, // info - COLOUR_CYAN_LIGHT, // verbose - COLOUR_CYAN_DARK, // debug - COLOUR_CYAN_DARK } // trace -*/ +static const char COLOUR_CYAN_DARK[] = ""; +static const char COLOUR_CYAN_LIGHT[] = "\033[38;5;14m"; +static const char COLOUR_RED[] = ""; +static const char COLOUR_RED_BOLD[] = ""; +static const char COLOUR_RESET[] = ""; +static const char COLOUR_YELLOW[] = ""; +static const char *_LL_COLOURS[] = { COLOUR_RED_BOLD, // fatal + COLOUR_RED_BOLD, // stderr + COLOUR_RED, // warning + COLOUR_YELLOW, // info + COLOUR_CYAN_LIGHT, // verbose + COLOUR_CYAN_DARK, // debug + COLOUR_CYAN_DARK }; // trace +/* This lets us index by verbosity, which can be negative. */ +static const char **LL_COLOURS = _LL_COLOURS + 3; + /** External variables **/ /* Level of chatter on stderr. */ enum log_level verbose; +/* If true, use colored logging. */ +bool log_color_p = true; + /* Path to host temporary directory. Set during command line processing. 
*/ char *host_tmp = NULL; @@ -634,8 +638,8 @@ void msgv(enum log_level level, const char *file, int line, int errno_, char *errno_code; // errno code/number char *errno_desc; // errno description char *text_full; // complete text but w/o color codes -// char *colour; // ANSI codes for color -// char *colour_reset; // ANSI codes to reset color + const char * colour; // ANSI codes for color + const char * colour_reset; // ANSI codes to reset color if (level > verbose) // not verbose enough to log message; do nothing return; @@ -668,12 +672,21 @@ void msgv(enum log_level level, const char *file, int line, int errno_, T_ (1 <= asprintf(&errno_desc, ": %s", strerror(errno_))); } + // Color. + if (log_color_p) { + colour = LL_COLOURS[level]; + colour_reset = COLOUR_RESET; + } else { + colour = ""; + colour_reset = ""; + }; + // Format and print. T_ (1 <= asprintf(&text_full, "%s[%d]: %s%s%s (%s:%d%s)", program_invocation_short_name, getpid(), level_prefix, text_formatted, errno_desc, file, line, errno_code)); - fprintf(stderr, "%s\n", text_full); + fprintf(stderr, "%s%s%s\n", colour, text_full, colour_reset); if (fflush(stderr)) abort(); // can't print an error b/c already trying to do that if (level == LL_WARNING) @@ -887,17 +900,20 @@ void warnings_reprint(void) size_t offset = 0; int warn_ct = buf_strings_count(warnings, WARNINGS_SIZE); - if (warn_ct > 0) - fprintf(stderr, "%s[%d]: warning: reprinting first %d warning(s)\n", - program_invocation_short_name, getpid(), warn_ct); - - while ( warnings[offset] != 0 - || (offset < (WARNINGS_SIZE - 1) && warnings[offset+1] != 0)) { - fputs(warnings + offset, stderr); - fputc('\n', stderr); - offset += strlen(warnings + offset) + 1; + if (warn_ct > 0) { + if (log_color_p) + T_ (EOF != fputs(LL_COLOURS[LL_WARNING], stderr)); + T_ (1 <= fprintf(stderr, "%s[%d]: reprinting first %d warning(s)\n", + program_invocation_short_name, getpid(), warn_ct)); + while ( warnings[offset] != 0 + || (offset < (WARNINGS_SIZE - 1) && 
warnings[offset+1] != 0)) { + T_ (EOF != fputs(warnings + offset, stderr)); + T_ (EOF != fputc('\n', stderr)); + offset += strlen(warnings + offset) + 1; + } + if (log_color_p) + T_ (EOF != fputs(COLOUR_RESET, stderr)); + if (fflush(stderr)) + abort(); // can't print an error b/c already trying to do that } - - if (fflush(stderr)) - abort(); // can't print an error b/c already trying to do that } diff --git a/bin/ch_misc.h b/bin/ch_misc.h index 0144b792c..6c92d95f6 100644 --- a/bin/ch_misc.h +++ b/bin/ch_misc.h @@ -109,6 +109,7 @@ enum log_level { LL_FATAL = -3, /** External variables **/ extern enum log_level verbose; +extern bool log_color_p; extern char *host_tmp; extern char *username; extern char *warnings; From f4037d0b5033dd1c380afb684f459181180d423c Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky Date: Tue, 11 Jun 2024 17:17:36 -0600 Subject: [PATCH 07/29] document --color [skip ci] --- doc/ch-run.rst | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/doc/ch-run.rst b/doc/ch-run.rst index 2bb7689a5..3b2dbca22 100644 --- a/doc/ch-run.rst +++ b/doc/ch-run.rst @@ -61,6 +61,23 @@ mounting SquashFS images with FUSE. Default: :code:`CH_RUN_CDI_DIRS` if set, otherwise :code:`/etc/cdi:/var/run/cdi`. + :code:`--color[=WHEN]` + Color logging output by log level, depending on :code:`WHEN`: + + * By default, or if :code:`WHEN` is :code:`auto`, :code:`tty`, or + :code:`if-tty`: use color if standard error is a TTY; otherwise, + don’t use color. + + * If :code:`WHEN` is :code:`yes`, :code:`always`, or :code:`force`; or + if :code:`--color` is specified without an argument: always use + color. + + * If :code:`WHEN` is :code:`no`, :code:`never`, or :code:`none`: never + use color. + + This uses ANSI color codes without checking any terminal databases, which + should work on all modern terminals. + :code:`-d`, :code:`--devices` Inject default CDI devices into the container.
The default devices are those listed in :code:`CH_RUN_CDI_DEFAULT` if set, otherwise all devices From 396ab76410996cd3ca0a3d68f561dc8312e57396 Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky Date: Fri, 14 Jun 2024 15:02:41 -0600 Subject: [PATCH 08/29] implement --color [skip ci] --- bin/ch-run.c | 80 +++++++++++++++++++++++++++++++++++++++------------ bin/ch_misc.c | 50 +++++++++++++++++++++++++------- bin/ch_misc.h | 10 +++++++ 3 files changed, 110 insertions(+), 30 deletions(-) diff --git a/bin/ch-run.c b/bin/ch-run.c index dc18d8528..b6965125a 100644 --- a/bin/ch-run.c +++ b/bin/ch-run.c @@ -20,6 +20,30 @@ #include "ch_misc.h" +/** Types **/ + +struct args { + struct container c; + struct env_delta *env_deltas; +#ifdef HAVE_JSON + char ** cdi_devids; +#endif + enum log_color_when log_color; + enum log_test log_test; + char *initial_dir; +#ifdef HAVE_SECCOMP + bool seccomp_p; +#endif + char *storage_dir; + bool unsafe; +}; + +struct log_color_synonym { + char *name; + enum log_color_when color; +}; + + /** Constants and macros **/ /* Environment variables used by --join parameters. */ @@ -33,6 +57,20 @@ char *JOIN_TAG_ENV[] = { "SLURM_STEP_ID", /* Default overlaid tmpfs size. */ char *WRITE_FAKE_DEFAULT = "12%"; +/* Log color WHEN synonyms. Note that no argument (i.e., bare --color) is + handled separately. 
*/ +struct log_color_synonym log_color_synonyms[] = { + { "auto", LL_COLOR_AUTO }, + { "tty", LL_COLOR_AUTO }, + { "if-tty", LL_COLOR_AUTO }, + { "yes", LL_COLOR_YES }, + { "always", LL_COLOR_YES }, + { "force", LL_COLOR_YES }, + { "no", LL_COLOR_NO }, + { "never", LL_COLOR_NO }, + { "none", LL_COLOR_NO }, + { NULL, LL_COLOR_NULL } }; + /** Command line options **/ @@ -57,6 +95,10 @@ const struct argp_option options[] = { { "cd", 'c', "DIR", 0, "initial working directory in container"}, #ifdef HAVE_JSON { "cdi-dirs", -19, "DIRS", 0, "director(y|ies) containing CDI specs" }, +#endif + { "color", -20, "WHEN", OPTION_ARG_OPTIONAL, + "specify when to use colored logging" }, +#ifdef HAVE_JSON { "device", -18, "DEV", 0, "inject CDI device(s) DEV (repeatable)" }, { "devices", 'd', 0, 0, "inject default CDI devices" }, #endif @@ -95,23 +137,6 @@ const struct argp_option options[] = { }; -/** Types **/ - -struct args { - struct container c; - struct env_delta *env_deltas; -#ifdef HAVE_JSON - char ** cdi_devids; -#endif - char *initial_dir; -#ifdef HAVE_SECCOMP - bool seccomp_p; -#endif - char *storage_dir; - bool unsafe; -}; - - /** Function prototypes **/ void fix_environment(struct args *args); @@ -179,6 +204,8 @@ int main(int argc, char *argv[]) .type = IMG_NONE, .writable = false }, .cdi_devids = list_new(sizeof(char *), 0), + .log_color = LL_COLOR_AUTO, + .log_test = LL_TEST_NONE, .env_deltas = list_new(sizeof(struct env_delta), 0), .initial_dir = NULL, #ifdef HAVE_SECCOMP @@ -198,6 +225,7 @@ int main(int argc, char *argv[]) Z_ (argp_parse(&argp, argc, argv, 0, &arg_next, &args)); if (!argp_help_fmt_set) Z_ (unsetenv("ARGP_HELP_FMT")); + logging_init(args.log_color, args.log_test); if (arg_next >= argc - 1) { printf("usage: ch-run [OPTION...] 
IMAGE -- COMMAND [ARG...]\n"); @@ -514,9 +542,9 @@ static error_t parse_opt(int key, char *arg, struct argp_state *state) break; case -17: // --test if (!strcmp(arg, "log")) - test_logging(false); + args->log_test = LL_TEST_YES; else if (!strcmp(arg, "log-fail")) - test_logging(true); + args->log_test = LL_TEST_FATAL; else FATAL("invalid --test argument: %s; see source code", arg); break; @@ -527,6 +555,20 @@ static error_t parse_opt(int key, char *arg, struct argp_state *state) list_append((void **)&(args->cdi_devids), &arg, sizeof(arg)); break; #endif + case -20: // --color + if (arg == NULL) + args->log_color = LL_COLOR_AUTO; + args->log_color = LL_COLOR_NULL; + for (int i = 0; true; i++) { + if (log_color_synonyms[i].name == NULL) + break; + if (!strcmp(arg, log_color_synonyms[i].name)) { + args->log_color = log_color_synonyms[i].color; + break; + } + } + Tf (args->log_color != LL_COLOR_NULL, "--color: invalid arg: %s", arg); + break; case 'b': { // --bind char *src, *dst; for (i = 0; args->c.binds[i].src != NULL; i++) // count existing binds diff --git a/bin/ch_misc.c b/bin/ch_misc.c index 697cd8c10..1f4218428 100644 --- a/bin/ch_misc.c +++ b/bin/ch_misc.c @@ -63,8 +63,8 @@ static const char **LL_COLOURS = _LL_COLOURS + 3; /* Level of chatter on stderr. */ enum log_level verbose; -/* If true, use colored logging. */ -bool log_color_p = true; +/* If true, use colored logging. Set in ch-run.c. */ +bool log_color_p = false; /* Path to host temporary directory. Set during command line processing. */ char *host_tmp = NULL; @@ -471,15 +471,43 @@ void log_ids(const char *func, int line) } } -void test_logging(bool fail) { - TRACE("trace"); - DEBUG("debug"); - VERBOSE("verbose"); - INFO("info"); - WARNING("warning"); - if (fail) - FATAL("the program failed inexplicably (\"log-fail\" specified)"); - exit(0); + +/* Set up logging. 
Note ch-run(1) specifies a bunch of + color synonyms; this translation happens during argument parsing.*/ +void logging_init(enum log_color_when when, enum log_test test) +{ + // set up colors + switch (when) { + case LL_COLOR_AUTO: + if (isatty(fileno(stderr))) + log_color_p = true; + else { + T_ (errno == ENOTTY); + log_color_p = false; + } + break; + case LL_COLOR_YES: + log_color_p = true; + break; + case LL_COLOR_NO: + log_color_p = false; + break; + case LL_COLOR_NULL: + Tf(0, "unreachable code reached"); + break; + } + + // test logging + if (test >= LL_TEST_YES) { + TRACE("trace"); + DEBUG("debug"); + VERBOSE("verbose"); + INFO("info"); + WARNING("warning"); + if (test >= LL_TEST_FATAL) + FATAL("the program failed inexplicably (\"log-fail\" specified)"); + exit(0); + } } /* Create the directory at path, despite its parent not allowing write access, diff --git a/bin/ch_misc.h b/bin/ch_misc.h index 6c92d95f6..5d3062056 100644 --- a/bin/ch_misc.h +++ b/bin/ch_misc.h @@ -105,6 +105,15 @@ enum log_level { LL_FATAL = -3, LL_DEBUG = 2, LL_TRACE = 3 }; +enum log_color_when { LL_COLOR_NULL = 0, + LL_COLOR_AUTO, + LL_COLOR_YES, + LL_COLOR_NO }; + +enum log_test { LL_TEST_NONE = 0, + LL_TEST_YES = 1, + LL_TEST_FATAL = 2 }; + /** External variables **/ @@ -132,6 +141,7 @@ struct env_var env_var_parse(const char *line, const char *path, size_t lineno); void list_append(void **ar, void *new, size_t size); void *list_new(size_t size, size_t ct); void log_ids(const char *func, int line); +void logging_init(enum log_color_when when, enum log_test test); void test_logging(bool fail); void mkdirs(const char *base, const char *path, char **denylist, const char *scratch); From 867c532bbb56fd50417a82bd4f747bdca7922b86 Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky Date: Fri, 14 Jun 2024 15:09:49 -0600 Subject: [PATCH 09/29] tidy colors [skip ci] --- bin/ch_misc.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/bin/ch_misc.c b/bin/ch_misc.c index 
1f4218428..11f36a541 100644 --- a/bin/ch_misc.c +++ b/bin/ch_misc.c @@ -34,7 +34,10 @@ /** Constants **/ -/* Text colors. In principle, we should be using a library for this, e.g. +/* Text colors. Note leading escape characters (U+001B), which don’t always + show up depending on your viewer. + + In principle, we should be using a library for this, e.g. terminfo(5). However, moderately thorough web searching suggests that pretty much any modern terminal will support 256-color ANSI codes, and this is way simpler [1]. Probably should coordinate these colors with the Python @@ -42,7 +45,7 @@ [1]: https://stackoverflow.com/a/3219471 */ static const char COLOUR_CYAN_DARK[] = ""; -static const char COLOUR_CYAN_LIGHT[] = "\033[38;5;14m"; +static const char COLOUR_CYAN_LIGHT[] = ""; static const char COLOUR_RED[] = ""; static const char COLOUR_RED_BOLD[] = ""; static const char COLOUR_RESET[] = ""; From e2e6badeb92b155666617cbaab86eeb5155465c1 Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky Date: Tue, 18 Jun 2024 14:57:52 -0600 Subject: [PATCH 10/29] snapshot [skip ci] --- bin/ch_json.c | 90 ++++++++++++++++++++++++++++++++++++++++------ bin/ch_json.h | 7 ---- bin/ch_misc.c | 39 +++++++++++++++----- bin/ch_misc.h | 2 ++ misc/gdb-backtrace | 15 +++++++- 5 files changed, 126 insertions(+), 27 deletions(-) diff --git a/bin/ch_json.c b/bin/ch_json.c index f4ec964b4..362629432 100644 --- a/bin/ch_json.c +++ b/bin/ch_json.c @@ -9,6 +9,7 @@ #include CJSON_H +#include "ch_core.h" #include "ch_json.h" #include "ch_misc.h" @@ -18,6 +19,15 @@ /** Types **/ +struct cdi_spec { + char *kind; + char *src; // path to source spec file + bool requested; + bool ldconfig_p; + struct env_var *envs; + struct bind *binds; +}; + struct json_dispatch { char *name; struct json_dispatch *children; @@ -35,12 +45,15 @@ const size_t READ_SZ = 16384; /** Function prototypes (private) **/ void cdi_add(struct cdi_spec ***specs, struct cdi_spec *spec_new); +void cdi_free(struct cdi_spec *spec); +void 
cdi_log(struct cdi_spec *spec); struct cdi_spec *cdi_read(const char *path); void visit(struct json_dispatch actions[], cJSON *tree, void *state); void visit_dispatch(struct json_dispatch action, cJSON *tree, void *state); // parser callbacks void cdiPC_kind(cJSON *tree, struct cdi_spec *spec); +void cdiPC_env(cJSON *tree, struct cdi_spec *spec); /** Global variables **/ @@ -54,9 +67,10 @@ void cdiPC_kind(cJSON *tree, struct cdi_spec *spec); and error-prone. 2. Add a local variable of the correct type to each callback. I thought - distributed boilerplate like this seemed worse. */ + such distributed boilerplate seemed worse. */ struct json_dispatch cdiPD_root[] = { { "kind", NULL, (JDF)cdiPC_kind }, + { "env", NULL, (JDF)cdiPC_env }, { } }; @@ -78,7 +92,39 @@ void cdi_add(struct cdi_spec ***specs, struct cdi_spec *spec_new) } // don’t alread have the kind if we got through the loop DEBUG("CDI: spec %s: new", spec_new->kind); - list_append((void **)specs, spec_new, sizeof(spec_new)); + list_append((void **)specs, &spec_new, sizeof(spec_new)); +} + +/* Free spec. */ +void cdi_free(struct cdi_spec *spec) +{ + free(spec->kind); + free(spec->src); + for (size_t i = 0; spec->envs[i].name != NULL; i++) { + free(spec->envs[i].name); + free(spec->envs[i].value); + } + free(spec->envs); + free(spec); +} + +/* Log contents of spec. 
*/ +void cdi_log(struct cdi_spec *spec) +{ + size_t ct; + + DEBUG("CDI: spec %s from %s:", spec->kind, spec->src); + DEBUG("CDI: devices requested: %s", bool_to_string(spec->requested)); + DEBUG("CDI: ldconfig(8) needed: %s", bool_to_string(spec->ldconfig_p)); + ct = list_count((void *)(spec->envs), sizeof(struct env_var)); + DEBUG("CDI: environment: %d:", ct); + for (size_t i = 0; i < ct; i++) + DEBUG("CDI: %s=%s", spec->envs[i].name, spec->envs[i].value); + ct = list_count((void *)(spec->binds), sizeof(struct bind)); + DEBUG("CDI: bind mounts: %d:", ct); + for (size_t i = 0; i < ct; i++) { + DEBUG("CDI: %s -> %s", spec->binds[i].src, spec->binds[i].dst); + } } /* Read and parse the CDI spec file at path. Return a pointer to the parsed @@ -113,10 +159,9 @@ struct cdi_spec *cdi_read(const char *path) Tf(tree != NULL, "CDI: JSON failed at byte %d: %s", parse_end - text, path); // Visit parse tree to build our struct. - T_ (spec = malloc(sizeof(struct cdi_spec))); - visit(cdiPD_root, tree, spec); - - Tf (false, "haha you %s", "suck"); + T_ (spec = calloc(1, sizeof(struct cdi_spec))); + T_ (spec->src = strdup(path)); + visit(cdiPD_root, tree, spec); // Clean up. VERBOSE("CDI: spec read OK: %s: %s", spec->kind, path); @@ -130,6 +175,7 @@ struct cdi_spec *cdi_read(const char *path) bind mounts) happens later. */ void cdi_update(struct container *c, char **devids) { + struct cdi_spec *spec; struct cdi_spec **specs = NULL; // read CDI spec files in configured directories @@ -137,11 +183,14 @@ void cdi_update(struct container *c, char **devids) // read CDI spec files specifically requested for (size_t i = 0; devids[i] != NULL; i++) if (devids[i][0] == '.' 
|| devids[i][0] == '/') { - cdi_add(&specs, cdi_read(devids[i])); - // FIXME: add kind to requested list + spec = cdi_read(devids[i]); + spec->requested = true; + cdi_add(&specs, spec); } // debugging: print parsed CDI specs + for (size_t i = 0; specs[i] != NULL; i++) + cdi_log(specs[0]); // filter device kinds to those requested @@ -150,8 +199,8 @@ void cdi_update(struct container *c, char **devids) // set ldconfig bit // clean up - //for (size_t i = 0; specs[i] != NULL; i++) - // cdi_free(specs[i]); + for (size_t i = 0; specs[i] != NULL; i++) + cdi_free(specs[i]); free(specs); } @@ -160,11 +209,32 @@ void cdiPC_kind(cJSON *tree, struct cdi_spec *spec) T_ (spec->kind = strdup(tree->valuestring)); } +void cdiPC_env(cJSON *tree, struct cdi_spec *spec) +{ + struct env_var ev; + size_t name_len, value_len; // not including null terminator + char *delim, *arnold; + + T_ (cJSON_IsString(tree)); + T_ (delim = strchr(tree->valuestring, '=')); + T_ (arnold = strchr(tree->valuestring, 0)); + + name_len = delim - tree->valuestring; + value_len = arnold - delim - 1; + T_ (ev.name = malloc(name_len + 1)); + memcpy(ev.name, tree->valuestring, name_len); + ev.name[name_len - 1] = 0; + T_ (ev.value = malloc(value_len + 1)); + memcpy(ev.value, delim + 1, value_len); + ev.value[value_len - 1] = 0; +} + /* Visit each node in the parse tree in depth-first order. At each node, if there is a matching callback in actions, call it. For arrays, call the callback once per array element. 
*/ void visit(struct json_dispatch actions[], cJSON *tree, void *state) { + printf("visiting: %s\n", tree->valuestring); for (int i = 0; actions[i].name != NULL; i++) { cJSON *subtree = cJSON_GetObjectItem(tree, actions[i].name); if (cJSON_IsArray(subtree)) { diff --git a/bin/ch_json.h b/bin/ch_json.h index 34056b339..20329fe19 100644 --- a/bin/ch_json.h +++ b/bin/ch_json.h @@ -15,13 +15,6 @@ /** Types **/ -struct cdi_spec { - char *kind; - bool ldconfig_p; - struct env_var *envs; - struct bind *binds; -}; - /** Function prototypes **/ diff --git a/bin/ch_misc.c b/bin/ch_misc.c index 11f36a541..6c60d9a40 100644 --- a/bin/ch_misc.c +++ b/bin/ch_misc.c @@ -174,6 +174,12 @@ char *argv_to_string(char **argv) return s; } +/* Return bool b as a string. */ +const char *bool_to_string(bool b) +{ + return (b ? "yes" : "no"); +} + /* Iterate through buffer “buf” of size “s” consisting of null-terminated strings and return the number of strings in it. Key assumptions: @@ -422,18 +428,25 @@ void list_append(void **ar, void *new, size_t size) int ct; T_ (new != NULL); - // count existing elements - if (*ar == NULL) - ct = 0; - else - for (ct = 0; !buf_zero_p((char *)*ar + ct*size, size); ct++) - ; - + ct = list_count(*ar, size); T_ (*ar = realloc(*ar, (ct+2)*size)); // existing + new + terminator memcpy((char *)*ar + ct*size, new, size); // append new (no overlap) memset((char *)*ar + (ct+1)*size, 0, size); // set new terminator } +/* Return the number of elements of size size in list *ar. */ +size_t list_count(void *ar, size_t size) +{ + size_t ct; + + if (ar == NULL) + return 0; + + for (ct = 0; !buf_zero_p((char *)ar + ct*size, size); ct++) + ; + return ct; +} + /* Return a pointer to a new, empty zero-terminated array containing elements of size size, with room for ct elements without re-allocation. 
The latter allows to pre-allocate an arbitrary number of slots in the list, which can @@ -447,7 +460,10 @@ void *list_new(size_t size, size_t ct) return list; } -/* If verbose, print uids and gids on stderr prefixed with where. */ +/* If verbose enough, print uids and gids on stderr prefixed with where. + + FIXME: Should change to DEBUG(), but that will give the file/line within + this function, which we don’t want. */ void log_ids(const char *func, int line) { uid_t ruid, euid, suid; @@ -455,9 +471,11 @@ void log_ids(const char *func, int line) gid_t supp_gids[SUPP_GIDS_MAX]; int supp_gid_ct; - if (verbose >= 3) { + if (verbose >= LL_TRACE + 1) { // don’t bother b/c haven’t needed in ages Z_ (getresuid(&ruid, &euid, &suid)); Z_ (getresgid(&rgid, &egid, &sgid)); + if (log_color_p) + T_ (EOF != fputs(LL_COLOURS[LL_TRACE], stderr)); fprintf(stderr, "%s %d: uids=%d,%d,%d, gids=%d,%d,%d + ", func, line, ruid, euid, suid, rgid, egid, sgid); supp_gid_ct = getgroups(SUPP_GIDS_MAX, supp_gids); @@ -471,6 +489,9 @@ void log_ids(const char *func, int line) fprintf(stderr, "%d", supp_gids[i]); } fprintf(stderr, "\n"); + if (log_color_p) + T_ (EOF != fputs(COLOUR_RESET, stderr)); + Z_ (fflush(stderr)); } } diff --git a/bin/ch_misc.h b/bin/ch_misc.h index 5d3062056..bfb8791a5 100644 --- a/bin/ch_misc.h +++ b/bin/ch_misc.h @@ -128,6 +128,7 @@ extern size_t warnings_offset; /** Function prototypes **/ char *argv_to_string(char **argv); +const char *bool_to_string(bool b); int buf_strings_count(char *str, size_t s); bool buf_zero_p(void *buf, size_t size); char *cat(const char *a, const char *b); @@ -139,6 +140,7 @@ void env_set(const char *name, const char *value, const bool expand); void env_unset(const char *glob); struct env_var env_var_parse(const char *line, const char *path, size_t lineno); void list_append(void **ar, void *new, size_t size); +size_t list_count(void *ar, size_t size); void *list_new(size_t size, size_t ct); void log_ids(const char *func, int line); void 
logging_init(enum log_color_when when, enum log_test test); diff --git a/misc/gdb-backtrace b/misc/gdb-backtrace index f5c5dbc85..faddbb1a1 100755 --- a/misc/gdb-backtrace +++ b/misc/gdb-backtrace @@ -1,6 +1,19 @@ #!/bin/bash -gdb -batch $1 $2 \ +# $1 executable +# $2 core dump, or directory containing core dumps, in which case pick newest + +bin=$1 +core=$2 + +if [[ -d $core ]]; then + # kludge but good enough for now (https://stackoverflow.com/q/1015678) + printf "$core is a directory\n" 1>&2 + core=$core/$(ls -At $core | head -1) + printf "using $core\n" 1>&2 +fi + +gdb -batch $bin $core \ -ex 'set style enabled on' \ -ex 'set print pretty on' \ -ex 'set print frame-info source-and-location' \ From a941c9e945782412b6d461738ca5b89279c53137 Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky Date: Tue, 18 Jun 2024 15:57:47 -0600 Subject: [PATCH 11/29] parse environment vars seems to work [skip ci] --- bin/ch_json.c | 44 +++++++++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/bin/ch_json.c b/bin/ch_json.c index 362629432..af7c59ac5 100644 --- a/bin/ch_json.c +++ b/bin/ch_json.c @@ -52,8 +52,9 @@ void visit(struct json_dispatch actions[], cJSON *tree, void *state); void visit_dispatch(struct json_dispatch action, cJSON *tree, void *state); // parser callbacks -void cdiPC_kind(cJSON *tree, struct cdi_spec *spec); +void cdiPC_cdiVersion(cJSON *tree, struct cdi_spec *spec); void cdiPC_env(cJSON *tree, struct cdi_spec *spec); +void cdiPC_kind(cJSON *tree, struct cdi_spec *spec); /** Global variables **/ @@ -68,9 +69,14 @@ void cdiPC_env(cJSON *tree, struct cdi_spec *spec); 2. Add a local variable of the correct type to each callback. I thought such distributed boilerplate seemed worse. 
*/ +struct json_dispatch cdiPD_containerEdits[] = { + { "env", NULL, (JDF)cdiPC_env }, + { } +}; struct json_dispatch cdiPD_root[] = { - { "kind", NULL, (JDF)cdiPC_kind }, - { "env", NULL, (JDF)cdiPC_env }, + { "cdiVersion", NULL, (JDF)cdiPC_cdiVersion }, + { "kind", NULL, (JDF)cdiPC_kind }, + { "containerEdits", cdiPD_containerEdits, }, { } }; @@ -113,7 +119,7 @@ void cdi_log(struct cdi_spec *spec) { size_t ct; - DEBUG("CDI: spec %s from %s:", spec->kind, spec->src); + DEBUG("CDI: %s from %s:", spec->kind, spec->src); DEBUG("CDI: devices requested: %s", bool_to_string(spec->requested)); DEBUG("CDI: ldconfig(8) needed: %s", bool_to_string(spec->ldconfig_p)); ct = list_count((void *)(spec->envs), sizeof(struct env_var)); @@ -204,9 +210,9 @@ void cdi_update(struct container *c, char **devids) free(specs); } -void cdiPC_kind(cJSON *tree, struct cdi_spec *spec) +void cdiPC_cdiVersion(cJSON *tree, struct cdi_spec *spec) { - T_ (spec->kind = strdup(tree->valuestring)); + DEBUG("CDI: %s: version %s", spec->src, tree->valuestring); } void cdiPC_env(cJSON *tree, struct cdi_spec *spec) @@ -223,10 +229,17 @@ void cdiPC_env(cJSON *tree, struct cdi_spec *spec) value_len = arnold - delim - 1; T_ (ev.name = malloc(name_len + 1)); memcpy(ev.name, tree->valuestring, name_len); - ev.name[name_len - 1] = 0; + ev.name[name_len] = 0; T_ (ev.value = malloc(value_len + 1)); memcpy(ev.value, delim + 1, value_len); - ev.value[value_len - 1] = 0; + ev.value[value_len] = 0; + + list_append((void **)&spec->envs, &ev, sizeof(ev)); +} + +void cdiPC_kind(cJSON *tree, struct cdi_spec *spec) +{ + T_ (spec->kind = strdup(tree->valuestring)); } /* Visit each node in the parse tree in depth-first order. At each node, if @@ -234,15 +247,16 @@ void cdiPC_env(cJSON *tree, struct cdi_spec *spec) callback once per array element. 
*/ void visit(struct json_dispatch actions[], cJSON *tree, void *state) { - printf("visiting: %s\n", tree->valuestring); for (int i = 0; actions[i].name != NULL; i++) { cJSON *subtree = cJSON_GetObjectItem(tree, actions[i].name); - if (cJSON_IsArray(subtree)) { - cJSON *elem; - cJSON_ArrayForEach(elem, subtree) - visit_dispatch(actions[i], elem, state); - } else { - visit_dispatch(actions[i], subtree, state); + if (subtree != NULL) { // child matching action name exists + if (!cJSON_IsArray(subtree)) + visit_dispatch(actions[i], subtree, state); + else { + cJSON *elem; + cJSON_ArrayForEach(elem, subtree) + visit_dispatch(actions[i], elem, state); + } } } } From 3b9a89110a6827692338fe176b36bb9f84256fc2 Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky Date: Thu, 20 Jun 2024 16:54:51 -0600 Subject: [PATCH 12/29] first draft nVidia ldcache hook emulation [skip ci] --- bin/ch_json.c | 147 +++++++++++++++++++++++++++++++++++++++++++++++--- bin/ch_misc.c | 9 ++-- 2 files changed, 145 insertions(+), 11 deletions(-) diff --git a/bin/ch_json.c b/bin/ch_json.c index af7c59ac5..ac41881ff 100644 --- a/bin/ch_json.c +++ b/bin/ch_json.c @@ -19,13 +19,27 @@ /** Types **/ +/* Dispatch table row for CDI hook emulation. + + We could alternately put args last, making it a “flexible array member”. + That would make the field order slightly sub-optimal, but more importantly + it would make sizeof() return misleading results, which seems like a + nasty trap waiting for someone. 
*/ +#define HOOK_ARG_MAX 3 +struct cdi_hook_dispatch { + size_t arg_ct; // number of arguments to compare + char *args[HOOK_ARG_MAX]; // matching arguments + void (*f)(void *, char **args); // NULL to ignore quietly +}; +#define HDF void (*)(void *, char **args) // to cast in dispatch tables + struct cdi_spec { char *kind; - char *src; // path to source spec file - bool requested; - bool ldconfig_p; + char *src; // path to source spec file + bool requested; // true if user asked for this device kind struct env_var *envs; struct bind *binds; + char **ldconfig_dirs; // directories to process with ldconfig(8) }; struct json_dispatch { @@ -33,7 +47,7 @@ struct json_dispatch { struct json_dispatch *children; void (*f)(cJSON *tree, void *state); }; -#define JDF void (*)(cJSON *, void *) /* to cast callbacks in dispatch tables */ +#define JDF void (*)(cJSON *, void *) // to cast callbacks in dispatch tables /** Constants **/ @@ -44,8 +58,11 @@ const size_t READ_SZ = 16384; /** Function prototypes (private) **/ +char **array_strings_json_to_c(cJSON *jarry, size_t *ct); void cdi_add(struct cdi_spec ***specs, struct cdi_spec *spec_new); void cdi_free(struct cdi_spec *spec); +void cdi_hook_nv_ldcache(struct cdi_spec *spec, char **args); +char *cdi_hook_to_string(const char *hook_name, char **args); void cdi_log(struct cdi_spec *spec); struct cdi_spec *cdi_read(const char *path); void visit(struct json_dispatch actions[], cJSON *tree, void *state); @@ -54,6 +71,7 @@ void visit_dispatch(struct json_dispatch action, cJSON *tree, void *state); // parser callbacks void cdiPC_cdiVersion(cJSON *tree, struct cdi_spec *spec); void cdiPC_env(cJSON *tree, struct cdi_spec *spec); +void cdiPC_hook(cJSON *tree, struct cdi_spec *spec); void cdiPC_kind(cJSON *tree, struct cdi_spec *spec); @@ -71,6 +89,7 @@ void cdiPC_kind(cJSON *tree, struct cdi_spec *spec); such distributed boilerplate seemed worse. 
*/ struct json_dispatch cdiPD_containerEdits[] = { { "env", NULL, (JDF)cdiPC_env }, + { "hooks", NULL, (JDF)cdiPC_hook }, { } }; struct json_dispatch cdiPD_root[] = { @@ -80,9 +99,47 @@ struct json_dispatch cdiPD_root[] = { { } }; +/* CDI hook dispatch table. */ +struct cdi_hook_dispatch cdi_hooks[] = { + { 2, { "nvidia-ctk-hook", "update-ldcache" }, (HDF)cdi_hook_nv_ldcache }, + { 3, { "nvidia-ctk", "hook", "update-ldcache" }, (HDF)cdi_hook_nv_ldcache }, + { 2, { "nvidia-ctk-hook", "chmod" }, NULL }, + { 3, { "nvidia-ctk", "hook", "chmod" }, NULL }, + { 2, { "nvidia-ctk-hook", "create-symlinks" }, NULL }, + { 3, { "nvidia-ctk", "hook", "create-symlinks" }, NULL }, + { } +}; + /** Functions **/ + +/* Given JSON array of strings jar, which may be of length zero, convert it to + a freshly allocated NULL-terminated array of C strings (pointers to + null-terminated chars buffers) and return that. ct is an out parameter + + WARNING: This is a shallow copy, i.e., the actual strings are still owned + by the JSON array. */ +char **array_strings_json_to_c(cJSON *jarry, size_t *ct) +{ + size_t i; + char **carry; + cJSON *j; + + Tf (cJSON_IsArray(jarry), "JSON: expected array"); + *ct = cJSON_GetArraySize(jarry); + T_ (carry = malloc((*ct + 1) * sizeof(char *))); + carry[*ct] = NULL; + + i = 0; + cJSON_ArrayForEach(j, jarry) { + Tf (cJSON_IsString(j), "JSON: expected string"); + carry[i++] = j->valuestring; + } + + return carry; +} + /* Add spec to the given list of CDI specs, which is an out parameter. If we’ve seen the spec’s kind before, replace the existing spec with the same kind. Otherwise, append the new spec. 
*/ @@ -111,9 +168,43 @@ void cdi_free(struct cdi_spec *spec) free(spec->envs[i].value); } free(spec->envs); + for (size_t i = 0; spec->ldconfig_dirs[i] != NULL; i++) + free(spec->ldconfig_dirs[i]); + free(spec->ldconfig_dirs); free(spec); } +void cdi_hook_nv_ldcache(struct cdi_spec *spec, char **args) +{ + for (size_t i = 0; args[i] != NULL; i++) + if (!strcmp("--folder", args[i])) { + char *dir; + T_ (args[i+1] != NULL); + T_ (dir = strdup(args[i+1])); + // FIXME: YOU ARE HERE: APPEND ONLY IF WE DON'T ALREADY HAVE DIR + list_append((void **)&spec->ldconfig_dirs, &dir, sizeof(dir)); + i++; + } +} + +/* Return a freshly allocated string describing the given hook, for logging. */ +char *cdi_hook_to_string(const char *hook_name, char **args) +{ + char *ret, *args_str; + + args_str = strdup(""); + for (size_t i = 0; args[i] != NULL; i++) { + char *as_old = args_str; + T_ (1 <= asprintf(&args_str, "%s %s", as_old, args[i])); + free(as_old); + } + + T_ (1 <= asprintf(&ret, "%s:%s", hook_name, args_str)); + + free(args_str); + return ret; +} + /* Log contents of spec. 
*/ void cdi_log(struct cdi_spec *spec) { @@ -121,16 +212,18 @@ void cdi_log(struct cdi_spec *spec) DEBUG("CDI: %s from %s:", spec->kind, spec->src); DEBUG("CDI: devices requested: %s", bool_to_string(spec->requested)); - DEBUG("CDI: ldconfig(8) needed: %s", bool_to_string(spec->ldconfig_p)); ct = list_count((void *)(spec->envs), sizeof(struct env_var)); DEBUG("CDI: environment: %d:", ct); for (size_t i = 0; i < ct; i++) DEBUG("CDI: %s=%s", spec->envs[i].name, spec->envs[i].value); ct = list_count((void *)(spec->binds), sizeof(struct bind)); DEBUG("CDI: bind mounts: %d:", ct); - for (size_t i = 0; i < ct; i++) { + for (size_t i = 0; i < ct; i++) DEBUG("CDI: %s -> %s", spec->binds[i].src, spec->binds[i].dst); - } + ct = list_count((void *)(spec->ldconfig_dirs), sizeof(char *)); + DEBUG("CDI: ldconfig directories: %d:", ct); + for (size_t i = 0; i < ct; i++) + DEBUG("CDI: %s", spec->ldconfig_dirs[i]); } /* Read and parse the CDI spec file at path. Return a pointer to the parsed @@ -237,6 +330,46 @@ void cdiPC_env(cJSON *tree, struct cdi_spec *spec) list_append((void **)&spec->envs, &ev, sizeof(ev)); } +void cdiPC_hook(cJSON *tree, struct cdi_spec *spec) +{ + char **args; + size_t arg_ct; + char *hook_name; + char *hook_str; + bool hook_known; + //struct cdi_hook_dispatch hook; + + T_ (hook_name = cJSON_GetStringValue(cJSON_GetObjectItem(tree, "hookName"))); + + T_ (cJSON_IsArray(cJSON_GetObjectItem(tree, "args"))); + args = array_strings_json_to_c(cJSON_GetObjectItem(tree, "args"), &arg_ct); + hook_str = cdi_hook_to_string(hook_name, args); + + hook_known = false; + for (size_t i = 0; cdi_hooks[i].arg_ct != 0; i++) { // for each table row + if (arg_ct >= cdi_hooks[i].arg_ct) { // enough hook args to compare + for (size_t j = 0; j < cdi_hooks[i].arg_ct; j++) + if (strcmp(args[j], cdi_hooks[i].args[j])) + goto continue_outer; + hook_known = true; // all words matched + if (cdi_hooks[i].f == NULL) { + DEBUG("CDI: ignoring known hook: %s", hook_str); + } else { + 
DEBUG("CDI: emulating known hook: %s", hook_str); + cdi_hooks[i].f(spec, &args[cdi_hooks[i].arg_ct]); + } + break; // only call one hook function + } + continue_outer: + } + + if (!hook_known) + WARNING("CDI: ignoring unknown hook: %s", hook_str); + + free(hook_str); + free(args); +} + void cdiPC_kind(cJSON *tree, struct cdi_spec *spec) { T_ (spec->kind = strdup(tree->valuestring)); diff --git a/bin/ch_misc.c b/bin/ch_misc.c index 6c60d9a40..f36abe1f5 100644 --- a/bin/ch_misc.c +++ b/bin/ch_misc.c @@ -44,12 +44,13 @@ code somehow. [1]: https://stackoverflow.com/a/3219471 */ -static const char COLOUR_CYAN_DARK[] = ""; -static const char COLOUR_CYAN_LIGHT[] = ""; -static const char COLOUR_RED[] = ""; +static const char COLOUR_CYAN_DARK[] = ""; +static const char COLOUR_CYAN_LIGHT[] = ""; +//static const char COLOUR_GRAY[] = ""; +static const char COLOUR_RED[] = ""; static const char COLOUR_RED_BOLD[] = ""; static const char COLOUR_RESET[] = ""; -static const char COLOUR_YELLOW[] = ""; +static const char COLOUR_YELLOW[] = ""; static const char *_LL_COLOURS[] = { COLOUR_RED_BOLD, // fatal COLOUR_RED_BOLD, // stderr COLOUR_RED, // warning From edc2e46b5497546a379b207d137b3ae93b92babd Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky Date: Fri, 21 Jun 2024 16:48:52 -0600 Subject: [PATCH 13/29] rm ch_ prefix from C sources [skip ci] --- bin/Makefile.am | 8 ++++---- bin/ch-checkns.c | 2 +- bin/ch-run.c | 6 +++--- bin/{ch_core.c => core.c} | 6 +++--- bin/{ch_core.h => core.h} | 0 bin/{ch_fuse.c => fuse.c} | 6 +++--- bin/{ch_fuse.h => fuse.h} | 0 bin/{ch_json.c => json.c} | 6 +++--- bin/{ch_json.h => json.h} | 4 ++-- bin/{ch_misc.c => misc.c} | 2 +- bin/{ch_misc.h => misc.h} | 0 configure.ac | 2 +- doc/dev.rst | 22 +++++++++++----------- doc/faq.rst | 2 +- 14 files changed, 33 insertions(+), 33 deletions(-) rename bin/{ch_core.c => core.c} (99%) rename bin/{ch_core.h => core.h} (100%) rename bin/{ch_fuse.c => fuse.c} (99%) rename bin/{ch_fuse.h => fuse.h} (100%) rename 
bin/{ch_json.c => json.c} (99%) rename bin/{ch_json.h => json.h} (88%) rename bin/{ch_misc.c => misc.c} (99%) rename bin/{ch_misc.h => misc.h} (100%) diff --git a/bin/Makefile.am b/bin/Makefile.am index 72fad95d0..613f78cd0 100644 --- a/bin/Makefile.am +++ b/bin/Makefile.am @@ -6,14 +6,14 @@ bin_PROGRAMS = ch-checkns ch-run -ch_checkns_SOURCES = ch-checkns.c ch_misc.h ch_misc.c +ch_checkns_SOURCES = ch-checkns.c misc.h misc.c -ch_run_SOURCES = ch-run.c ch_core.h ch_core.c ch_misc.h ch_misc.c +ch_run_SOURCES = ch-run.c core.h core.c misc.h misc.c if HAVE_JSON -ch_run_SOURCES += ch_json.h ch_json.c +ch_run_SOURCES += json.h json.c endif if HAVE_LIBSQUASHFUSE -ch_run_SOURCES += ch_fuse.h ch_fuse.c +ch_run_SOURCES += fuse.h fuse.c endif # additional build flags for ch-run diff --git a/bin/ch-checkns.c b/bin/ch-checkns.c index 10f26969a..6bc18134e 100644 --- a/bin/ch-checkns.c +++ b/bin/ch-checkns.c @@ -49,7 +49,7 @@ #include #include "config.h" -#include "ch_misc.h" +#include "misc.h" const char usage[] = "\ diff --git a/bin/ch-run.c b/bin/ch-run.c index b6965125a..b8f5a9f89 100644 --- a/bin/ch-run.c +++ b/bin/ch-run.c @@ -13,11 +13,11 @@ #include #include "config.h" -#include "ch_core.h" +#include "core.h" #ifdef HAVE_JSON -#include "ch_json.h" +#include "json.h" #endif -#include "ch_misc.h" +#include "misc.h" /** Types **/ diff --git a/bin/ch_core.c b/bin/core.c similarity index 99% rename from bin/ch_core.c rename to bin/core.c index 3850dbfa2..34e249714 100644 --- a/bin/ch_core.c +++ b/bin/core.c @@ -29,10 +29,10 @@ #include #include -#include "ch_misc.h" -#include "ch_core.h" +#include "misc.h" +#include "core.h" #ifdef HAVE_LIBSQUASHFUSE -#include "ch_fuse.h" +#include "fuse.h" #endif diff --git a/bin/ch_core.h b/bin/core.h similarity index 100% rename from bin/ch_core.h rename to bin/core.h diff --git a/bin/ch_fuse.c b/bin/fuse.c similarity index 99% rename from bin/ch_fuse.c rename to bin/fuse.c index ce60bbcc7..164b8b16e 100644 --- a/bin/ch_fuse.c +++ 
b/bin/fuse.c @@ -36,9 +36,9 @@ #include #include "config.h" -#include "ch_core.h" -#include "ch_fuse.h" -#include "ch_misc.h" +#include "core.h" +#include "fuse.h" +#include "misc.h" /** Types **/ diff --git a/bin/ch_fuse.h b/bin/fuse.h similarity index 100% rename from bin/ch_fuse.h rename to bin/fuse.h diff --git a/bin/ch_json.c b/bin/json.c similarity index 99% rename from bin/ch_json.c rename to bin/json.c index ac41881ff..ca80a4613 100644 --- a/bin/ch_json.c +++ b/bin/json.c @@ -9,9 +9,9 @@ #include CJSON_H -#include "ch_core.h" -#include "ch_json.h" -#include "ch_misc.h" +#include "core.h" +#include "json.h" +#include "misc.h" /** Macros **/ diff --git a/bin/ch_json.h b/bin/json.h similarity index 88% rename from bin/ch_json.h rename to bin/json.h index 20329fe19..139879536 100644 --- a/bin/ch_json.h +++ b/bin/json.h @@ -7,8 +7,8 @@ #pragma once #include "config.h" -#include "ch_core.h" -#include "ch_misc.h" +#include "core.h" +#include "misc.h" #include CJSON_H diff --git a/bin/ch_misc.c b/bin/misc.c similarity index 99% rename from bin/ch_misc.c rename to bin/misc.c index f36abe1f5..e10b3a93a 100644 --- a/bin/ch_misc.c +++ b/bin/misc.c @@ -17,7 +17,7 @@ #include #include "config.h" -#include "ch_misc.h" +#include "misc.h" /** Macros **/ diff --git a/bin/ch_misc.h b/bin/misc.h similarity index 100% rename from bin/ch_misc.h rename to bin/misc.h diff --git a/configure.ac b/configure.ac index f15cc6dfd..10c917f9e 100644 --- a/configure.ac +++ b/configure.ac @@ -585,7 +585,7 @@ AS_IF([test $want_libsquashfuse = yes], [ [have_ll_h=no], [#define SQFS_CONFIG_H #define FUSE_USE_VERSION 32 - ]) # see comment in ch_fuse.c regarding these defines + ]) # see comment in fuse.c regarding these defines ], [have_libfuse3=no]) ]) diff --git a/doc/dev.rst b/doc/dev.rst index 4a3f53284..8458274b0 100644 --- a/doc/dev.rst +++ b/doc/dev.rst @@ -1054,19 +1054,19 @@ computed, but it’s all in raw hex and hard to interpret, e.g.:: $ ch-run --seccomp -vv alpine:3.17 -- true [...] 
- ch-run[62763]: seccomp: arch c00000b7: found 13 syscalls (ch_core.c:582) - ch-run[62763]: seccomp: arch 40000028: found 27 syscalls (ch_core.c:582) + ch-run[62763]: seccomp: arch c00000b7: found 13 syscalls (core.c:582) + ch-run[62763]: seccomp: arch 40000028: found 27 syscalls (core.c:582) [...] - ch-run[62763]: seccomp(2) program has 156 instructions (ch_core.c:591) - ch-run[62763]: 0: { op=20 k= 4 jt= 0 jf= 0 } (ch_core.c:423) - ch-run[62763]: 1: { op=15 k=c00000b7 jt= 0 jf= 17 } (ch_core.c:423) - ch-run[62763]: 2: { op=20 k= 0 jt= 0 jf= 0 } (ch_core.c:423) - ch-run[62763]: 3: { op=15 k= 5b jt=145 jf= 0 } (ch_core.c:423) + ch-run[62763]: seccomp(2) program has 156 instructions (core.c:591) + ch-run[62763]: 0: { op=20 k= 4 jt= 0 jf= 0 } (core.c:423) + ch-run[62763]: 1: { op=15 k=c00000b7 jt= 0 jf= 17 } (core.c:423) + ch-run[62763]: 2: { op=20 k= 0 jt= 0 jf= 0 } (core.c:423) + ch-run[62763]: 3: { op=15 k= 5b jt=145 jf= 0 } (core.c:423) [...] - ch-run[62763]: 154: { op= 6 k=7fff0000 jt= 0 jf= 0 } (ch_core.c:423) - ch-run[62763]: 155: { op= 6 k= 50000 jt= 0 jf= 0 } (ch_core.c:423) - ch-run[62763]: note: see FAQ to disassemble the above (ch_core.c:676) - ch-run[62763]: executing: true (ch_core.c:538) + ch-run[62763]: 154: { op= 6 k=7fff0000 jt= 0 jf= 0 } (core.c:423) + ch-run[62763]: 155: { op= 6 k= 50000 jt= 0 jf= 0 } (core.c:423) + ch-run[62763]: note: see FAQ to disassemble the above (core.c:676) + ch-run[62763]: executing: true (core.c:538) You can instead use `seccomp-tools `_ to disassemble and pretty-print diff --git a/doc/faq.rst b/doc/faq.rst index 83ba73e8e..c30c29101 100644 --- a/doc/faq.rst +++ b/doc/faq.rst @@ -201,7 +201,7 @@ handling. 
For example:: $ ch-run /var/tmp/hello -- /bin/echo foo - ch-run[154334]: error: can’t execve(2): /bin/echo: Permission denied (ch_core.c:387 13) + ch-run[154334]: error: can’t execve(2): /bin/echo: Permission denied (core.c:387 13) But :code:`/bin/echo` *does* have execute permission:: From 03ef95ecb767c821625e679b702c79710a8af137 Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky Date: Tue, 9 Jul 2024 16:27:40 -0600 Subject: [PATCH 14/29] snapshot --- .gitignore | 1 + bin/ch-run.c | 14 +++++++++----- bin/core.h | 5 +++-- bin/json.c | 12 +++++++++--- 4 files changed, 22 insertions(+), 10 deletions(-) diff --git a/.gitignore b/.gitignore index cee56d28d..4af822fca 100644 --- a/.gitignore +++ b/.gitignore @@ -44,6 +44,7 @@ a.out /charliecloud-*/ # debugging crap +core /build-cache.gv /build-cache.pdf diff --git a/bin/ch-run.c b/bin/ch-run.c index b8f5a9f89..b9aeb5702 100644 --- a/bin/ch-run.c +++ b/bin/ch-run.c @@ -24,10 +24,10 @@ struct args { struct container c; - struct env_delta *env_deltas; #ifdef HAVE_JSON - char ** cdi_devids; + char **cdi_devids; #endif + struct env_delta *env_deltas; enum log_color_when log_color; enum log_test log_test; char *initial_dir; @@ -193,6 +193,7 @@ int main(int argc, char *argv[]) .env_expand = true, .host_home = NULL, .img_ref = NULL, + .ldconfigs = list_new(sizeof(char *), 0), .newroot = NULL, .join = false, .join_ct = 0, @@ -203,11 +204,13 @@ int main(int argc, char *argv[]) .private_tmp = false, .type = IMG_NONE, .writable = false }, +#ifdef HAVE_JSON .cdi_devids = list_new(sizeof(char *), 0), - .log_color = LL_COLOR_AUTO, - .log_test = LL_TEST_NONE, +#endif .env_deltas = list_new(sizeof(struct env_delta), 0), .initial_dir = NULL, + .log_color = LL_COLOR_AUTO, + .log_test = LL_TEST_NONE, #ifdef HAVE_SECCOMP .seccomp_p = false, #endif @@ -288,7 +291,8 @@ int main(int argc, char *argv[]) cdi_update(&args.c, args.cdi_devids); containerize(&args.c); - fix_environment(&args); + fix_environment(&args); // FIXME -- fold into 
hooks_prestart()? + //hooks_prestart(&args.c); #ifdef HAVE_SECCOMP if (args.seccomp_p) seccomp_install(); diff --git a/bin/core.h b/bin/core.h index d6ba1fb75..9ab8d5626 100644 --- a/bin/core.h +++ b/bin/core.h @@ -37,14 +37,15 @@ struct container { bool env_expand; // expand variables in --set-env char *host_home; // if --home, host path to user homedir, else NULL char *img_ref; // image description from command line + char **ldconfigs; // directories to pass to image’s ldconfig(8) char *newroot; // path to new root directory bool join; // is this a synchronized join? int join_ct; // number of peers in a synchronized join pid_t join_pid; // process in existing namespace to join char *join_tag; // identifier for synchronized join char *overlay_size; // size of overlaid tmpfs (NULL for no overlay) - bool private_passwd; // don't bind custom /etc/{passwd,group} - bool private_tmp; // don't bind host's /tmp + bool private_passwd; // don’t bind custom /etc/{passwd,group} + bool private_tmp; // don’t bind host's /tmp enum img_type type; // directory, SquashFS, etc. 
bool writable; // re-mount image read-write }; diff --git a/bin/json.c b/bin/json.c index ca80a4613..b1de98b0c 100644 --- a/bin/json.c +++ b/bin/json.c @@ -287,15 +287,21 @@ void cdi_update(struct container *c, char **devids) cdi_add(&specs, spec); } + // filter device kinds to those requested + // debugging: print parsed CDI specs for (size_t i = 0; specs[i] != NULL; i++) cdi_log(specs[0]); - // filter device kinds to those requested - // figure out bind mounts actually needed and set up symlinks - // set ldconfig bit + // update c + list_join( + // + // lconfigs -- copy rather than assigning b/c (1) easier to free and (2) + // still works if we later grow other sources of ldconfig + // + // need list_join() // clean up for (size_t i = 0; specs[i] != NULL; i++) From eb00188f2737d4e6467b2c18baab2c91947c7897 Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky Date: Fri, 12 Jul 2024 16:17:35 -0600 Subject: [PATCH 15/29] uniqify spec list [skip ci] --- bin/json.c | 74 ++++++++++++++++++++++++---------------------------- bin/misc.c | 50 ++++++++++++++++++++++++++++++++--- bin/misc.h | 6 +++++ configure.ac | 22 +++++++++++----- 4 files changed, 102 insertions(+), 50 deletions(-) diff --git a/bin/json.c b/bin/json.c index b1de98b0c..a9122ed77 100644 --- a/bin/json.c +++ b/bin/json.c @@ -36,10 +36,9 @@ struct cdi_hook_dispatch { struct cdi_spec { char *kind; char *src; // path to source spec file - bool requested; // true if user asked for this device kind struct env_var *envs; struct bind *binds; - char **ldconfig_dirs; // directories to process with ldconfig(8) + char **ldconfigs; // directories to process with ldconfig(8) }; struct json_dispatch { @@ -59,7 +58,7 @@ const size_t READ_SZ = 16384; /** Function prototypes (private) **/ char **array_strings_json_to_c(cJSON *jarry, size_t *ct); -void cdi_add(struct cdi_spec ***specs, struct cdi_spec *spec_new); +int cdi_cmp_kind(const void *a, const void *b); void cdi_free(struct cdi_spec *spec); void cdi_hook_nv_ldcache(struct 
cdi_spec *spec, char **args); char *cdi_hook_to_string(const char *hook_name, char **args); @@ -140,22 +139,18 @@ char **array_strings_json_to_c(cJSON *jarry, size_t *ct) return carry; } -/* Add spec to the given list of CDI specs, which is an out parameter. If - we’ve seen the spec’s kind before, replace the existing spec with the same - kind. Otherwise, append the new spec. */ -void cdi_add(struct cdi_spec ***specs, struct cdi_spec *spec_new) +/* Compare the kinds of specifications a and b (which are double pointers, + hence the hairy casts). As expected by qsort(3): + + if a < b: return negative value + if a = b: return 0 + if a > b: return positive value */ +int cdi_cmp_kind(const void *a, const void *b) { - if (*specs != NULL) - for (size_t i = 0; (*specs)[i] != NULL; i++) - if (!strcmp((*specs)[i]->kind, spec_new->kind)) { - DEBUG("CDI: spec %s: replacing at %d", spec_new->kind, i); - free((*specs)[i]); - *specs[i] = spec_new; - return; - } - // don’t alread have the kind if we got through the loop - DEBUG("CDI: spec %s: new", spec_new->kind); - list_append((void **)specs, &spec_new, sizeof(spec_new)); + struct cdi_spec *a_ = *(struct cdi_spec **)a; + struct cdi_spec *b_ = *(struct cdi_spec **)b; + + return strcmp(a_->kind, b_->kind); } /* Free spec. 
*/ @@ -168,9 +163,9 @@ void cdi_free(struct cdi_spec *spec) free(spec->envs[i].value); } free(spec->envs); - for (size_t i = 0; spec->ldconfig_dirs[i] != NULL; i++) - free(spec->ldconfig_dirs[i]); - free(spec->ldconfig_dirs); + for (size_t i = 0; spec->ldconfigs[i] != NULL; i++) + free(spec->ldconfigs[i]); + free(spec->ldconfigs); free(spec); } @@ -182,7 +177,7 @@ void cdi_hook_nv_ldcache(struct cdi_spec *spec, char **args) T_ (args[i+1] != NULL); T_ (dir = strdup(args[i+1])); // FIXME: YOU ARE HERE: APPEND ONLY IF WE DON'T ALREADY HAVE DIR - list_append((void **)&spec->ldconfig_dirs, &dir, sizeof(dir)); + list_append((void **)&spec->ldconfigs, &dir, sizeof(dir)); i++; } } @@ -211,7 +206,6 @@ void cdi_log(struct cdi_spec *spec) size_t ct; DEBUG("CDI: %s from %s:", spec->kind, spec->src); - DEBUG("CDI: devices requested: %s", bool_to_string(spec->requested)); ct = list_count((void *)(spec->envs), sizeof(struct env_var)); DEBUG("CDI: environment: %d:", ct); for (size_t i = 0; i < ct; i++) @@ -220,10 +214,10 @@ void cdi_log(struct cdi_spec *spec) DEBUG("CDI: bind mounts: %d:", ct); for (size_t i = 0; i < ct; i++) DEBUG("CDI: %s -> %s", spec->binds[i].src, spec->binds[i].dst); - ct = list_count((void *)(spec->ldconfig_dirs), sizeof(char *)); + ct = list_count((void *)(spec->ldconfigs), sizeof(char *)); DEBUG("CDI: ldconfig directories: %d:", ct); for (size_t i = 0; i < ct; i++) - DEBUG("CDI: %s", spec->ldconfig_dirs[i]); + DEBUG("CDI: %s", spec->ldconfigs[i]); } /* Read and parse the CDI spec file at path. Return a pointer to the parsed @@ -274,34 +268,34 @@ struct cdi_spec *cdi_read(const char *path) bind mounts) happens later. 
*/ void cdi_update(struct container *c, char **devids) { - struct cdi_spec *spec; - struct cdi_spec **specs = NULL; + struct cdi_spec **specs = list_new(sizeof(struct cdi_spec *), 12); - // read CDI spec files in configured directories + // read CDI spec files in configured directories, if requested + // FIXME // read CDI spec files specifically requested for (size_t i = 0; devids[i] != NULL; i++) if (devids[i][0] == '.' || devids[i][0] == '/') { - spec = cdi_read(devids[i]); - spec->requested = true; - cdi_add(&specs, spec); + struct cdi_spec *spec = cdi_read(devids[i]); + list_append((void **)&specs, &spec, sizeof(spec)); } - // filter device kinds to those requested + // rm duplicate kinds + DEBUG("CDI: read %d specs", list_count(specs, sizeof(specs[0]))); + list_uniq(specs, sizeof(specs[0]), cdi_cmp_kind); // debugging: print parsed CDI specs + DEBUG("CDI: using %d specs", list_count(specs, sizeof(specs[0]))); for (size_t i = 0; specs[i] != NULL; i++) cdi_log(specs[0]); - // figure out bind mounts actually needed and set up symlinks - // update c - list_join( - // - // lconfigs -- copy rather than assigning b/c (1) easier to free and (2) - // still works if we later grow other sources of ldconfig - // - // need list_join() + for (size_t i = 0; specs[i] != NULL; i++) { + // ldconfigs; copy rather than assigning because (1) easier to free + // and (2) still works if we later grow other sources of ldconfig. 
+ list_cat((void **)&c->ldconfigs, (void *)specs[i]->ldconfigs, + sizeof(c->ldconfigs[0])); + } // clean up for (size_t i = 0; specs[i] != NULL; i++) diff --git a/bin/misc.c b/bin/misc.c index e10b3a93a..b81c10f5f 100644 --- a/bin/misc.c +++ b/bin/misc.c @@ -426,13 +426,26 @@ struct env_var env_var_parse(const char *line, const char *path, size_t lineno) [1]: http://www.c-faq.com/ptrs/genericpp.html */ void list_append(void **ar, void *new, size_t size) { - int ct; + size_t ct; T_ (new != NULL); ct = list_count(*ar, size); - T_ (*ar = realloc(*ar, (ct+2)*size)); // existing + new + terminator - memcpy((char *)*ar + ct*size, new, size); // append new (no overlap) - memset((char *)*ar + (ct+1)*size, 0, size); // set new terminator + T_ (*ar = realloc(*ar, (ct+2)*size)); // existing + new + terminator + memcpy(*ar + ct*size, new, size); // append new (no overlap) + memset(*ar + (ct+1)*size, 0, size); // set new terminator +} + +/* Copy the contents of list src onto the end of dest. */ +void list_cat(void **dst, void *src, size_t size) +{ + size_t ct_dst, ct_src; + T_ (src != NULL); + + ct_dst = list_count(*dst, size); + ct_src = list_count(src, size); + T_ (*dst = realloc(*dst, (ct_dst+ct_src+1)*size)); + memcpy(*dst + ct_dst*size, src, ct_src*size); // append src (no overlap) + memset(*dst + (ct_dst+ct_src)*size, 0, size); // set new terminator } /* Return the number of elements of size size in list *ar. */ @@ -461,6 +474,35 @@ void *list_new(size_t size, size_t ct) return list; } +/* Remove any duplicate elements in ar, in-place, according to comparison + function cmp. The last duplicate in the list wins. Preserves order + otherwise. */ +void list_uniq(void *ar, size_t size, comparison_fn_t cmp) +{ + size_t rm_ct; + size_t ct_starting = list_count(ar, size); + void *zero_blk = ar + ct_starting * size; // assumes terminated correctly + + // Loop backwards through array; set duplicates to zero. 
We could instead + // bubble out the duplicates here, but I felt keeping track of indices + // would be too hard. + for (int i = ct_starting - 1; i > 0; i--) { // ar[0] has nothing prior + if (memcmp(ar + i * size, zero_blk, size)) // if not already deleted + for (int j = i - 1; j >= 0; j--) + if (!cmp(ar + i * size, ar + j * size)) + memset(ar + j * size, 0, size); + } + // Loop forwards through array, shifting each item backwards the number of + // zero blocks we’ve seen so far. + rm_ct = 0; + for (int i = 0; i < ct_starting; i++) + if (!memcmp(ar + i * size, zero_blk, size)) // ar[i] deleted + rm_ct++; + else if (rm_ct > 0) + memcpy(ar + (i - rm_ct) * size, ar + i * size, size); + memset(ar + (ct_starting - rm_ct) * size, 0, size); // terminate +} + /* If verbose enough, print uids and gids on stderr prefixed with where. FIXME: Should change to DEBUG(), but that will give the file/line within diff --git a/bin/misc.h b/bin/misc.h index bfb8791a5..37c354216 100644 --- a/bin/misc.h +++ b/bin/misc.h @@ -78,6 +78,10 @@ /** Types **/ +#ifndef HAVE_COMPARISON_FN_T +typedef int (*comparison_fn_t) (const void *, const void *); +#endif + enum env_action { ENV_END = 0, // terminate list of environment changes ENV_SET_DEFAULT, // set by /ch/environment within image ENV_SET_VARS, // set by list of variables @@ -140,8 +144,10 @@ void env_set(const char *name, const char *value, const bool expand); void env_unset(const char *glob); struct env_var env_var_parse(const char *line, const char *path, size_t lineno); void list_append(void **ar, void *new, size_t size); +void list_cat(void **dst, void *src, size_t size); size_t list_count(void *ar, size_t size); void *list_new(size_t size, size_t ct); +void list_uniq(void *ar, size_t size, comparison_fn_t cmp); void log_ids(const char *func, int line); void logging_init(enum log_color_when when, enum log_test test); void test_logging(bool fail); diff --git a/configure.ac b/configure.ac index 10c917f9e..9bedac152 100644 --- 
a/configure.ac +++ b/configure.ac @@ -473,6 +473,13 @@ AC_CHECK_DECL(FNM_EXTMATCH, [[#define _GNU_SOURCE #include ]]) +# GNU extension type for the comparison function argument to qsort(3). +AC_CHECK_TYPE(comparison_fn_t, + [have_comparison_fn_t=yes], + [have_comparison_fn_t=no], + [[#define _GNU_SOURCE + #include ]]) + # cJSON. Note that we don’t try to ensure the header we find matches the # library we find. Hopefully that’s not a problem. AS_IF([test $want_json = yes], [ @@ -859,14 +866,10 @@ AC_SUBST([CH_RUN_LIBS]) AC_SUBST([PYTHON_SHEBANG]) AC_SUBST([SPHINX]) -AS_IF([test $have_overlayfs = yes], - [AC_DEFINE([HAVE_OVERLAYFS], [1], [unprivileged overlayfs])]) -AS_IF([test $have_tmpfs_xattrs = yes], - [AC_DEFINE([HAVE_TMPFS_XATTRS], [1], [tmpfs user xattrs])]) +AS_IF([test $have_comparison_fn_t = yes], + [AC_DEFINE([HAVE_COMPARISON_FN_T], [1], [comp. function for qsort(3)])]) AS_IF([test $have_fnm_extmatch = yes], [AC_DEFINE([HAVE_FNM_EXTMATCH], [1], [extended globs supported])]) -AS_IF([test $have_seccomp = yes], - [AC_DEFINE([HAVE_SECCOMP], [1], [seccomp supported])]) AM_CONDITIONAL([HAVE_JSON], [test $have_json = yes]) AS_IF([test $have_json = yes], [AC_DEFINE([HAVE_JSON], [1], [enable JSON features]) @@ -874,6 +877,12 @@ AS_IF([test $have_json = yes], AM_CONDITIONAL([HAVE_LIBSQUASHFUSE], [test $have_libsquashfuse = yes]) AS_IF([test $have_libsquashfuse = yes], [AC_DEFINE([HAVE_LIBSQUASHFUSE], [1], [link with libsquashfuse])]) +AS_IF([test $have_overlayfs = yes], + [AC_DEFINE([HAVE_OVERLAYFS], [1], [unprivileged overlayfs])]) +AS_IF([test $have_seccomp = yes], + [AC_DEFINE([HAVE_SECCOMP], [1], [seccomp supported])]) +AS_IF([test $have_tmpfs_xattrs = yes], + [AC_DEFINE([HAVE_TMPFS_XATTRS], [1], [tmpfs user xattrs])]) @@ -1036,6 +1045,7 @@ Building Charliecloud optional: extended glob patterns in --unset-env ... ${have_fnm_extmatch} + comparison_fn_t from libc ... ${have_comparison_fn_t} JSON features: ${have_json} enabled ... 
${want_json} From b114c8df2509094f8128918f5731d78df25aa84c Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky Date: Tue, 23 Jul 2024 16:16:20 -0600 Subject: [PATCH 16/29] refactor environment variables into new hook framework (untested) [skip ci] --- bin/Makefile.am | 3 + bin/ch-run.c | 244 ++++++++++++++++++++++++++++-------------------- bin/core.c | 13 +++ bin/core.h | 16 +++- bin/hook.c | 50 ++++++++++ bin/hook.h | 24 +++++ bin/misc.c | 47 ++++++---- bin/misc.h | 15 +-- bin/rootemu.h | 10 ++ bin/seccomp.h | 10 ++ doc/ch-run.rst | 86 +++++++++-------- 11 files changed, 340 insertions(+), 178 deletions(-) create mode 100644 bin/hook.c create mode 100644 bin/hook.h create mode 100644 bin/rootemu.h create mode 100644 bin/seccomp.h diff --git a/bin/Makefile.am b/bin/Makefile.am index 613f78cd0..8dee51c5e 100644 --- a/bin/Makefile.am +++ b/bin/Makefile.am @@ -15,6 +15,9 @@ endif if HAVE_LIBSQUASHFUSE ch_run_SOURCES += fuse.h fuse.c endif +if HAVE_SECCOMP +ch_run_SOURCES += seccomp.h seccomp.c +endif # additional build flags for ch-run ch_run_CFLAGS = $(PTHREAD_CFLAGS) diff --git a/bin/ch-run.c b/bin/ch-run.c index b9aeb5702..340d1a203 100644 --- a/bin/ch-run.c +++ b/bin/ch-run.c @@ -22,12 +22,25 @@ /** Types **/ +enum env_option_type { + ENV_END = 0, // list terminator sentinel + ENV_SET, // --set-env + ENV_SET0, // --set-env0 + ENV_UNSET, // --unset-env + ENV_CDI_DEV, // --device +}; + +struct env_option { + enum env_option_type opt; + char *arg; +}; + struct args { struct container c; #ifdef HAVE_JSON char **cdi_devids; #endif - struct env_delta *env_deltas; + struct env_option *env_options; enum log_color_when log_color; enum log_test log_test; char *initial_dir; @@ -144,9 +157,10 @@ bool get_first_env(char **array, char **name, char **value); void img_directory_verify(const char *img_path, const struct args *args); int join_ct(int cli_ct); char *join_tag(char *cli_tag); +void parse_env(struct env_option **options, enum env_option_type opt, + const char *arg); int 
parse_int(char *s, bool extra_ok, char *error_tag); static error_t parse_opt(int key, char *arg, struct argp_state *state); -void parse_set_env(struct args *args, char *arg, int delim); void privs_verify_invoking(); char *storage_default(void); void write_fake_enable(struct args *args, char *overlay_size); @@ -187,27 +201,30 @@ int main(int argc, char *argv[]) verbose = LL_INFO; // in ch_misc.c args = (struct args){ - .c = (struct container){ .binds = list_new(sizeof(struct bind), 0), - .container_gid = getegid(), - .container_uid = geteuid(), - .env_expand = true, - .host_home = NULL, - .img_ref = NULL, - .ldconfigs = list_new(sizeof(char *), 0), - .newroot = NULL, - .join = false, - .join_ct = 0, - .join_pid = 0, - .join_tag = NULL, - .overlay_size = NULL, - .private_passwd = false, - .private_tmp = false, - .type = IMG_NONE, - .writable = false }, + .c = (struct container){ + .binds = list_new(sizeof(struct bind), 0), + .container_gid = getegid(), + .container_uid = geteuid(), + .env_expand = true, + .hooks_prestart = list_new(sizeof(struct hook), 0); + .host_home = NULL, + .img_ref = NULL, + .ldconfigs = list_new(sizeof(char *), 0), + .newroot = NULL, + .join = false, + .join_ct = 0, + .join_pid = 0, + .join_tag = NULL, + .overlay_size = NULL, + .private_passwd = false, + .private_tmp = false, + .type = IMG_NONE, + .writable = false + }, #ifdef HAVE_JSON .cdi_devids = list_new(sizeof(char *), 0), #endif - .env_deltas = list_new(sizeof(struct env_delta), 0), + .env_options = list_new(sizeof(struct env_option), 0), .initial_dir = NULL, .log_color = LL_COLOR_AUTO, .log_test = LL_TEST_NONE, @@ -215,7 +232,8 @@ int main(int argc, char *argv[]) .seccomp_p = false, #endif .storage_dir = storage_default(), - .unsafe = false }; + .unsafe = false + }; /* I couldn't find a way to set argp help defaults other than this environment variable. Kludge sets/unsets only if not already set. 
*/ @@ -229,6 +247,7 @@ int main(int argc, char *argv[]) if (!argp_help_fmt_set) Z_ (unsetenv("ARGP_HELP_FMT")); logging_init(args.log_color, args.log_test); + env_hooks_install(&args); if (arg_next >= argc - 1) { printf("usage: ch-run [OPTION...] IMAGE -- COMMAND [ARG...]\n"); @@ -291,86 +310,119 @@ int main(int argc, char *argv[]) cdi_update(&args.c, args.cdi_devids); containerize(&args.c); - fix_environment(&args); // FIXME -- fold into hooks_prestart()? - //hooks_prestart(&args.c); -#ifdef HAVE_SECCOMP - if (args.seccomp_p) - seccomp_install(); -#endif - // run_command(ldconfig, true, NULL); // FIXME - run_user_command(c_argv, args.initial_dir); // should never return + hooks_prestart(&args.c); + run_user_command(c_argv, args.initial_dir); // should never return exit(EXIT_FAILURE); } /** Supporting functions **/ -/* Adjust environment variables. Call once containerized, i.e., already - pivoted into new root. */ -void fix_environment(struct args *args) +/* Find the first environment variable in array that is set; put its name in + *name and its value in *value, and return true. If none are set, return + false, and *name and *value are undefined. */ +bool get_first_env(char **array, char **name, char **value) { - char *old_value, *new_value; + for (int i = 0; array[i] != NULL; i++) { + *name = array[i]; + *value = getenv(*name); + if (*value != NULL) + return true; + } + + return false; +} + +/* Set the default environment variables that come before the user-specified + environment changes. d must be NULL. */ +void hook_envs_def_first(struct container *c, void *d) +{ + char *vnew, *vold; + T_ (d == NULL); // $HOME: If --home, set to “/home/$USER”. 
- if (args->c.host_home) { - Z_ (setenv("HOME", cat("/home/", username), 1)); + if (c->host_home) { + vnew = cat("/home/", username); + env_set("HOME", vnew, false); + free(vnew); } else if (path_exists("/root", NULL, true)) { - Z_ (setenv("HOME", "/root", 1)); + env_set("HOME", "/root", false); } else - Z_ (setenv("HOME", "/", 1)); + env_set("HOME", "/", false); // $PATH: Append /bin if not already present. - old_value = getenv("PATH"); - if (old_value == NULL) { + vold = getenv("PATH"); + if (vold == NULL) { WARNING("$PATH not set"); - } else if ( strstr(old_value, "/bin") != old_value - && !strstr(old_value, ":/bin")) { - T_ (1 <= asprintf(&new_value, "%s:/bin", old_value)); - Z_ (setenv("PATH", new_value, 1)); - VERBOSE("new $PATH: %s", new_value); + } else if (strstr(vold, "/bin") != vold && !strstr(vold, ":/bin")) { + T_ (1 <= asprintf(&vnew, "%s:/bin", vold)); + env_set("PATH", vnew, false); } // $TMPDIR: Unset. Z_ (unsetenv("TMPDIR")); +} + +/* Set the default environment variables that come after the user-specified + changes. d must be NULL. */ +void hook_envs_def_last(struct container *c, void *d) +{ + T_ (d == NULL); + env_set("CH_RUNNING", "Weird Al Yankovic", false); +} - // --set-env and --unset-env. - for (size_t i = 0; args->env_deltas[i].action != ENV_END; i++) { - struct env_delta ed = args->env_deltas[i]; - switch (ed.action) { - case ENV_END: - Te (false, "unreachable code reached"); +/* Install pre-start hooks for environment variable changes. */ +void hook_envs_install(struct args *args) +{ + hook_add(&args->hooks_prestart, "env-def-first", hook_envs_def_first, NULL); + + for (size_t i = 0; args->env_options[i].opt != ENV_END; i++) { + char *name_base, *name; + hookf_t *f; + void *d; + enum env_option_type opt = args->env_options[i].opt; + char *arg = args->env_options[i].arg; + + switch (opt) { + case ENV_SET: + case ENV_SET0: + int delim = ENV_SET ? 
'\n' : '\0'; + if (args == NULL) { // guest path; defer file read + struct env_file *ef; + name_base = "env-set-gfile"; + f = hook_envs_set_file; + T_ (ef = malloc(sizeof struct env_file)); + ef->name = arg; + ef->delim = delim; + d = ef; + } else { + f = hook_envs_set; + if (strchr(arg, '=') == NULL) { // host path; read file now + name_base = "env-set-hfile"; + d = env_file_read(arg, delim); + } else { // direct set + name_base = "env-set-direct"; + d = list_new(sizeof(struct env_var), 1); + } + } break; - case ENV_SET_DEFAULT: - ed.arg.vars = env_file_read("/ch/environment", ed.arg.delim); - // fall through - case ENV_SET_VARS: - for (size_t j = 0; ed.arg.vars[j].name != NULL; j++) - env_set(ed.arg.vars[j].name, ed.arg.vars[j].value, - args->c.env_expand); + case ENV_UNSET: + name_base = "env-unset"; + f = hook_envs_unset; + d = arg; break; - case ENV_UNSET_GLOB: - env_unset(ed.arg.glob); + case ENV_CDI_DEV: + name_base = "env-set-cdi"; + f = hook_envs_set; + d = cdi_envs_get(arg); break; } + T_ (1 <= asprintf(&name, "%s-%d", name_base, i)); + hook_add(&args->c.hooks_prestart, name, f, d); + free(name); } - // $CH_RUNNING is not affected by --unset-env or --set-env. - Z_ (setenv("CH_RUNNING", "Weird Al Yankovic", 1)); -} - -/* Find the first environment variable in array that is set; put its name in - *name and its value in *value, and return true. If none are set, return - false, and *name and *value are undefined. 
*/ -bool get_first_env(char **array, char **name, char **value) -{ - for (int i = 0; array[i] != NULL; i++) { - *name = array[i]; - *value = getenv(*name); - if (*value != NULL) - return true; - } - - return false; + hook_add(&args->hooks_prestart, "env-def-last", hook_envs_def_last, NULL); } /* Validate that it’s OK to run the IMG_DIRECTORY format image at path; if @@ -471,15 +523,11 @@ static error_t parse_opt(int key, char *arg, struct argp_state *state) args->c.join_pid = parse_int(arg, false, "--join-pid"); break; case -6: // --set-env - parse_set_env(args, arg, '\n'); - break; - case -7: { // --unset-env - struct env_delta ed; - Te (strlen(arg) > 0, "--unset-env: GLOB must have non-zero length"); - ed.action = ENV_UNSET_GLOB; - ed.arg.glob = arg; - list_append((void **)&(args->env_deltas), &ed, sizeof(ed)); - } break; + parse_env(&args->env_options, ENV_SET, arg); + break; + case -7: // --unset-env + parse_env(&args->env_options, ENV_UNSET, arg); + break; case -9: // --no-passwd args->c.private_passwd = true; break; @@ -537,7 +585,7 @@ static error_t parse_opt(int key, char *arg, struct argp_state *state) break; #endif case -15: // --set-env0 - parse_set_env(args, arg, '\0'); + parse_env(&args->env_options, ENV_SET0, arg); break; case -16: // --warnings for (int i = 1; i <= parse_int(arg, false, "--warnings"); i++) @@ -655,26 +703,16 @@ static error_t parse_opt(int key, char *arg, struct argp_state *state) return 0; } -void parse_set_env(struct args *args, char *arg, int delim) +void parse_env(struct env_option **options, enum env_option_type opt, + const char *arg) { - struct env_delta ed; - - if (arg == NULL) { - ed.action = ENV_SET_DEFAULT; - ed.arg.delim = delim; - } else { - ed.action = ENV_SET_VARS; - if (strchr(arg, '=') == NULL) - ed.arg.vars = env_file_read(arg, delim); - else { - ed.arg.vars = list_new(sizeof(struct env_var), 1); - ed.arg.vars[0] = env_var_parse(arg, NULL, 0); - } - } - list_append((void **)&(args->env_deltas), &ed, sizeof(ed)); + 
struct env_option eo = (struct env_option){ .opt = opt, + .arg = arg }; + Te (arg == NULL || strlen(arg) > 0, + "environment options: argument must have non-zero length"); + list_append((void **)env_options, &eo, sizeof(eo)); } - /* Validate that the UIDs and GIDs are appropriate for program start, and abort if not. diff --git a/bin/core.c b/bin/core.c index 34e249714..398ff7dae 100644 --- a/bin/core.c +++ b/bin/core.c @@ -385,6 +385,19 @@ void enter_udss(struct container *c) DEBUG("pivot_root(2) dance successful") } +/* Append hook function f to hook_list. When called, it will be passed d; this + lets hooks receive arbitrary arguments (i.e., it’s a poor person’s + closure). + + Warning: The hook framework does no memory management for name or d, i.e., + if name needs to be freed, that is the responsibility of the caller (this + function uses a copy), and/or if anything in d either needs to be freed, + that is the responsibility of the hook. */ +void hook_add(struct hook **hook_list, const char *name, hookf_t *f, void *d) +{ + // FIXME: hooks: environment variables, seccomp, CDI +} + /* Return image type of path, or exit with error if not a valid type. */ enum img_type image_type(const char *ref, const char *storage_dir) { diff --git a/bin/core.h b/bin/core.h index 9ab8d5626..4546d2496 100644 --- a/bin/core.h +++ b/bin/core.h @@ -1,6 +1,6 @@ /* Copyright © Triad National Security, LLC, and others. - This interface contains Charliecloud's core containerization features. */ + This interface contains Charliecloud’s core containerization features. 
*/ #define _GNU_SOURCE #pragma once @@ -23,6 +23,14 @@ struct bind { enum bind_dep dep; }; +struct container; // forward declaration to avoid definition loop +typedef void (hookf_t)(struct container *, void *); +struct hook { + char *name; + hookf_t *f; + void *data; +} + enum img_type { IMG_DIRECTORY, // normal directory, perhaps an external mount of some kind IMG_SQUASH, // SquashFS archive file (not yet mounted) @@ -35,6 +43,7 @@ struct container { gid_t container_gid; // GID to use in container uid_t container_uid; // UID to use in container bool env_expand; // expand variables in --set-env + struct hook *hooks_prestart; // prestart hook functions and their arguments char *host_home; // if --home, host path to user homedir, else NULL char *img_ref; // image description from command line char **ldconfigs; // directories to pass to image’s ldconfig(8) @@ -54,9 +63,8 @@ struct container { /** Function prototypes **/ void containerize(struct container *c); +void hook_add(struct hook **hook_list, hookf_t *f, void *d); +void hooks_prestart(struct container *c); enum img_type image_type(const char *ref, const char *images_dir); char *img_name2path(const char *name, const char *storage_dir); void run_user_command(char *argv[], const char *initial_dir); -#ifdef HAVE_SECCOMP -void seccomp_install(void); -#endif diff --git a/bin/hook.c b/bin/hook.c new file mode 100644 index 000000000..c607ef18a --- /dev/null +++ b/bin/hook.c @@ -0,0 +1,50 @@ +/* Copyright © Triad National Security, LLC, and others. */ + +#define _GNU_SOURCE +#pragma once + +#include "core.h" +#include "hook.h" +#include "misc.h" + + +/** Function prototypes (private) **/ + + +/** Functions **/ + +/* Set the environment variables listed in d, then free d. */ +void hook_envs_set(struct container *c, void *d) +{ + struct env_var *vars = d; + + envs_set(vars, c->env_expand); + + free(vars); +} + +/* Set the environment variables specified in file d, then free d. 
NOTE: + d->path is still owned by hook_envs_install()’s caller, so we do not free + that buffer. */ +void hook_envs_set_file(struct container *c, void *d) +{ + struct env_file *ef = d; + struct env_var *vars = env_file_read(d->path, d->delim); + + envs_set(vars, c->env_expand); + + free(vars); + free(ef); +} + +/* Unset the environment variables matching glob d. NOTE: d is owned by + hook_envs_install()’s caller, so we do not free it. */ +void hook_envs_unset(struct container *c, void *d) +{ + envs_unset((char *)d); +} + + +void hook_ldconfig(struct container *c, void *d) +{ +} diff --git a/bin/hook.h b/bin/hook.h new file mode 100644 index 000000000..c90151a96 --- /dev/null +++ b/bin/hook.h @@ -0,0 +1,24 @@ +/* Copyright © Triad National Security, LLC, and others. + + This interface contains hooks that don’t deserve their own file. */ + +#define _GNU_SOURCE +#pragma once + +#include "core.h" +#include "misc.h" + + +/** Types **/ + +struct env_file { + char *path; + bool expand; +} + + +/** Function prototypes **/ + +void hook_env_set_file(struct container *c, void *d); +void hook_env_set(struct container *c, void *d); +void hook_env_unset(struct container *c, void *d); diff --git a/bin/misc.c b/bin/misc.c index b81c10f5f..c1ccba605 100644 --- a/bin/misc.c +++ b/bin/misc.c @@ -221,7 +221,8 @@ bool buf_zero_p(void *buf, size_t size) return true; } -/* Concatenate strings a and b, then return the result. */ +/* Concatenate strings a and b into a newly-allocated buffer and return a + pointer to this buffer. */ char *cat(const char *a, const char *b) { char *ret; @@ -317,23 +318,27 @@ void env_set(const char *name, const char *value, const bool expand) bool first_written; // Walk through value fragments separated by colon and expand variables.
- T_ (value_ = strdup(value)); - value_expanded = ""; - first_written = false; - while (true) { // loop executes ≥ once - char *fgmt = strsep(&value_, ":"); // NULL -> no more items - if (fgmt == NULL) - break; - if (expand && fgmt[0] == '$' && fgmt[1] != 0) { - fgmt = getenv(fgmt + 1); // NULL if unset - if (fgmt != NULL && fgmt[0] == 0) - fgmt = NULL; // convert empty to unset - } - if (fgmt != NULL) { // NULL -> omit from output - if (first_written) - value_expanded = cat(value_expanded, ":"); - value_expanded = cat(value_expanded, fgmt); - first_written = true; + if (!expand) + value_expanded = value; + else { + T_ (value_ = strdup(value)); + value_expanded = ""; + first_written = false; + while (true) { // loop executes ≥ once + char *fgmt = strsep(&value_, ":"); // NULL -> no more items + if (fgmt == NULL) + break; + if (fgmt[0] == '$' && fgmt[1] != 0) { + fgmt = getenv(fgmt + 1); // NULL if unset + if (fgmt != NULL && fgmt[0] == 0) + fgmt = NULL; // convert empty to unset + } + if (fgmt != NULL) { // NULL -> omit from output + if (first_written) + value_expanded = cat(value_expanded, ":"); + value_expanded = cat(value_expanded, fgmt); + first_written = true; + } } } @@ -342,6 +347,12 @@ void env_set(const char *name, const char *value, const bool expand) Z_ (setenv(name, value_expanded, 1)); } +void envs_set(const struct env_var *vars, const bool expand) +{ + for (size_t i = 0; vars[i].name != NULL; i++) + env_set(env_set(vars[i].name, vars[i].value, expand)); +} + /* Remove variables matching glob from the environment. 
This is tricky, because there is no standard library function to iterate through the environment, and the environ global array can be re-ordered after diff --git a/bin/misc.h b/bin/misc.h index 37c354216..d199226a7 100644 --- a/bin/misc.h +++ b/bin/misc.h @@ -82,25 +82,11 @@ typedef int (*comparison_fn_t) (const void *, const void *); #endif -enum env_action { ENV_END = 0, // terminate list of environment changes - ENV_SET_DEFAULT, // set by /ch/environment within image - ENV_SET_VARS, // set by list of variables - ENV_UNSET_GLOB }; // unset glob matches - struct env_var { char *name; char *value; }; -struct env_delta { - enum env_action action; - union { - int delim; // ENV_SET_DEFAULT - struct env_var *vars; // ENV_SET_VARS - char *glob; // ENV_UNSET_GLOB - } arg; -}; - enum log_level { LL_FATAL = -3, LL_STDERR = -2, LL_WARNING = -1, @@ -141,6 +127,7 @@ int dir_ls_count(const char *path); int dir_ls_filter(const struct dirent *e); struct env_var *env_file_read(const char *path, int delim); void env_set(const char *name, const char *value, const bool expand); +void envs_set(const struct env_var *envs, const bool expand); void env_unset(const char *glob); struct env_var env_var_parse(const char *line, const char *path, size_t lineno); void list_append(void **ar, void *new, size_t size); diff --git a/bin/rootemu.h b/bin/rootemu.h new file mode 100644 index 000000000..ed46e93a5 --- /dev/null +++ b/bin/rootemu.h @@ -0,0 +1,10 @@ +/* Copyright © Triad National Security, LLC, and others. + + This interface contains the seccomp filter for root emulation. */ + +#define _GNU_SOURCE +#pragma once + +#include "core.h" + +void rootemu_init(struct container *c); diff --git a/bin/seccomp.h b/bin/seccomp.h new file mode 100644 index 000000000..ed46e93a5 --- /dev/null +++ b/bin/seccomp.h @@ -0,0 +1,10 @@ +/* Copyright © Triad National Security, LLC, and others. + + This interface contains the seccomp filter for root emulation. 
*/ + +#define _GNU_SOURCE +#pragma once + +#include "core.h" + +void rootemu_init(struct container *c); diff --git a/doc/ch-run.rst b/doc/ch-run.rst index 3b2dbca22..ec9d63876 100644 --- a/doc/ch-run.rst +++ b/doc/ch-run.rst @@ -154,11 +154,6 @@ mounting SquashFS images with FUSE. This is intended for use by :code:`ch-image(1)` when building images; see that man page for a detailed discussion. - :code:`-t`, :code:`--private-tmp` - By default, the host’s :code:`/tmp` (or :code:`$TMPDIR` if set) is - bind-mounted at container :code:`/tmp`. If this is specified, a new - :code:`tmpfs` is mounted on the container’s :code:`/tmp` instead. - :code:`--set-env`, :code:`--set-env=FILE`, :code:`--set-env=VAR=VALUE` Set environment variables with newline-separated file (:code:`/ch/environment` within the image if not specified) or on the @@ -167,6 +162,11 @@ mounting SquashFS images with FUSE. :code:`--set-env0`, :code:`--set-env0=FILE`, :code:`--set-env0=VAR=VALUE` Like :code:`--set-env`, but file is null-byte separated. + :code:`-t`, :code:`--private-tmp` + By default, the host’s :code:`/tmp` (or :code:`$TMPDIR` if set) is + bind-mounted at container :code:`/tmp`. If this is specified, a new + :code:`tmpfs` is mounted on the container’s :code:`/tmp` instead. + :code:`-u`, :code:`--uid=UID` Run as user :code:`UID` within container. @@ -507,8 +507,8 @@ the same. These are: represented in a host path once and then symlinking into it for the declared bind mounts. -Command line options and environment variables ----------------------------------------------- +Selecting devices +----------------- :code:`ch-run` must do two things to make CDI devices available: (1) locate appropriate specification files and (2) select which kinds of CDI devices to @@ -537,6 +537,16 @@ repeated to inject multiple device kinds. Importantly, both :code:`--device` and :code:`--devices` imply :code:`--write-fake` (:code:`-W`) so the container image can be written. 
+Environment variables +--------------------- + +Injecting a CDI device may require setting environment variables, as declared +in the spec file. These environment changes are executed in the order that +CDI command line options appear on the command line relative to other +user-specified environment options, e.g. :code:`--set-env` and +:code:`--unset-env`. See :ref:`ch-run_environment-variables` below for +details. + Hooks ------ @@ -594,39 +604,29 @@ Ignored hooks access to all appropriate files. +.. _ch-run_environment-variables: + Environment variables ===================== -:code:`ch-run` leaves environment variables unchanged, i.e. the host -environment is passed through unaltered, except: +Unlike most other implementations, :code:`ch-run`’s baseline for the container +environment is to pass through the host environment unaltered. From this +starting point, the environment is altered in this order: -* by default (:code:`--home` not specified), :code:`HOME` is set to - :code:`/root`, if it exists, and :code:`/` otherwise. -* limited tweaks to avoid significant guest breakage; -* user-set variables via :code:`--set-env`; -* user-unset variables via :code:`--unset-env`; and -* set :code:`CH_RUNNING`. +#. :code:`$HOME`, :code:`$PATH`, and :code:`$TMPDIR` are adjusted to avoid + common breakage (see below). -This section describes these features. +#. User-specified changes are executed in the order they appear on the command + line (i.e., :code:`-d`/:code:`--devices`, :code:`--device`, + :code:`--set-env`, and :code:`--unset-env`, some of which can appear + multiple times). -The default tweaks happen first, then :code:`--set-env` and -:code:`--unset-env` in the order specified on the command line, and then -:code:`CH_RUNNING`. The two options can be repeated arbitrarily many times, -e.g. to add/remove multiple variable sets or add only some variables in a -file. +#. :code:`$CH_RUNNING` is set.
-Default behavior ----------------- +Built-in environment changes +---------------------------- -By default, :code:`ch-run` makes the following environment variable changes: - -:code:`$CH_RUNNING` - Set to :code:`Weird Al Yankovic`. While a process can figure out that it’s - in an unprivileged container and what namespaces are active without this - hint, that can be messy, and there is no way to tell that it’s a - *Charliecloud* container specifically. This variable makes such a test - simple and well-defined. (**Note:** This variable is unaffected by - :code:`--unset-env`.) +Prior to user changes, i.e. can be altered by the user: :code:`$HOME` If :code:`--home` is specified, then your home directory is bind-mounted @@ -637,13 +637,12 @@ By default, :code:`ch-run` makes the following environment variable changes: is unchanged.) :code:`$PATH` - Newer Linux distributions replace some root-level directories, such as - :code:`/bin`, with symlinks to their counterparts in :code:`/usr`. - - Some of these distributions (e.g., Fedora 24) have also dropped :code:`/bin` - from the default :code:`$PATH`. This is a problem when the guest OS does - *not* have a merged :code:`/usr` (e.g., Debian 8 “Jessie”). Thus, we add - :code:`/bin` to :code:`$PATH` if it’s not already present. + We append :code:`/bin` to :code:`$PATH` if it’s not already present. This is + because newer Linux distributions replace some root-level directories, such + as :code:`/bin`, with symlinks to their counterparts in :code:`/usr`. Some + of these distributions (e.g., Fedora 24) have also dropped :code:`/bin` from + the default :code:`$PATH`. This is a problem when the guest OS does *not* + have a merged :code:`/usr` (e.g., Debian 8 “Jessie”). Further reading: @@ -656,6 +655,15 @@ By default, :code:`ch-run` makes the following environment variable changes: made available in the guest at :code:`/tmp` unless :code:`--private-tmp` is given. +After user changes, i.e. 
cannot be altered by the user with :code:`ch-run`: + +:code:`$CH_RUNNING` + Set to :code:`Weird Al Yankovic`. While a process can figure out that it’s + in an unprivileged container and what namespaces are active without this + hint, that can be messy, and there is no way to tell that it’s a + *Charliecloud* container specifically. This variable makes such a test + simple and well-defined. + Setting variables with :code:`--set-env` or :code:`--set-env0` -------------------------------------------------------------- From 8969842fe4a592269465bb58a49fa336f05b0ceb Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky Date: Mon, 29 Jul 2024 13:58:40 -0600 Subject: [PATCH 17/29] add and run hooks [skip ci] --- bin/ch-run.c | 2 +- bin/core.c | 25 ++++++++++++++++++++++++- bin/core.h | 2 +- 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/bin/ch-run.c b/bin/ch-run.c index 340d1a203..2e2b9cf29 100644 --- a/bin/ch-run.c +++ b/bin/ch-run.c @@ -310,7 +310,7 @@ int main(int argc, char *argv[]) cdi_update(&args.c, args.cdi_devids); containerize(&args.c); - hooks_prestart(&args.c); + hooks_run(&args.c, args.c.hooks_prestart); run_user_command(c_argv, args.initial_dir); // should never return exit(EXIT_FAILURE); } diff --git a/bin/core.c b/bin/core.c index 398ff7dae..13fbc3173 100644 --- a/bin/core.c +++ b/bin/core.c @@ -387,7 +387,7 @@ void enter_udss(struct container *c) /* Append hook function f to hook_list. When called, it will be passed d; this lets hooks receive arbitrary arguments (i.e., it’s a poor person’s - closure). + closure). hook_list must be a member of c. 
Warning: The hook framework does no memory management for name or d, i.e., if name needs to be freed, that is the responsibility of the caller (this @@ -396,6 +396,29 @@ void enter_udss(struct container *c) void hook_add(struct hook **hook_list, const char *name, hookf_t *f, void *d) { // FIXME: hooks: environment variables, seccomp, CDI + + struct hook h; + + T_ (h.name = strdup(name)); + h.f = f; + h.data = data; + + list_append(hook_list, &h, sizeof(h)); +} + +/* Run hooks in hook_list, passing c, then deallocate and set the pointer to + NULL. hook_list must be a member of c. */ +void hooks_run(struct container *c, struct hook **hook_list) +{ + for (int i = 0; (*hook_list)[i] != NULL; i++) { + struct hook *h = (*hook_list)[i]; + VERBOSE("calling hook: %s", h->name); + h->f(c, h->data); + free(h->name); + } + + free(*hook_list); + *hook_list = NULL; } /* Return image type of path, or exit with error if not a valid type. */ diff --git a/bin/core.h b/bin/core.h index 4546d2496..95bbe6dc3 100644 --- a/bin/core.h +++ b/bin/core.h @@ -64,7 +64,7 @@ struct container { void containerize(struct container *c); void hook_add(struct hook **hook_list, hookf_t *f, void *d); -void hooks_prestart(struct container *c); +void hooks_run(struct container *c, struct hook **hook_list); enum img_type image_type(const char *ref, const char *images_dir); char *img_name2path(const char *name, const char *storage_dir); void run_user_command(char *argv[], const char *initial_dir); From 63d6c510bdb79bc9a1fbc00905ef43ba7c8ad48b Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky Date: Mon, 29 Jul 2024 14:54:30 -0600 Subject: [PATCH 18/29] make it build [skip ci] --- bin/Makefile.am | 8 ++++---- bin/ch-run.c | 37 ++++++++++++++++++++---------------- bin/core.c | 19 ++++++++++--------- bin/core.h | 4 ++-- bin/hook.c | 5 +++-- bin/hook.h | 9 +++++---- bin/misc.c | 50 +++++++++++++++++++++++++------------------------ bin/misc.h | 2 +- bin/rootemu.h | 10 ---------- configure.ac | 1 + 10 files 
changed, 73 insertions(+), 72 deletions(-) delete mode 100644 bin/rootemu.h diff --git a/bin/Makefile.am b/bin/Makefile.am index 8dee51c5e..416022b29 100644 --- a/bin/Makefile.am +++ b/bin/Makefile.am @@ -8,16 +8,16 @@ bin_PROGRAMS = ch-checkns ch-run ch_checkns_SOURCES = ch-checkns.c misc.h misc.c -ch_run_SOURCES = ch-run.c core.h core.c misc.h misc.c +ch_run_SOURCES = ch-run.c core.h core.c hook.h hook.c misc.h misc.c if HAVE_JSON ch_run_SOURCES += json.h json.c endif if HAVE_LIBSQUASHFUSE ch_run_SOURCES += fuse.h fuse.c endif -if HAVE_SECCOMP -ch_run_SOURCES += seccomp.h seccomp.c -endif +#if HAVE_SECCOMP +#ch_run_SOURCES += seccomp.h seccomp.c +#endif # additional build flags for ch-run ch_run_CFLAGS = $(PTHREAD_CFLAGS) diff --git a/bin/ch-run.c b/bin/ch-run.c index 2e2b9cf29..5e207783d 100644 --- a/bin/ch-run.c +++ b/bin/ch-run.c @@ -14,6 +14,7 @@ #include "config.h" #include "core.h" +#include "hook.h" #ifdef HAVE_JSON #include "json.h" #endif @@ -152,13 +153,12 @@ const struct argp_option options[] = { /** Function prototypes **/ -void fix_environment(struct args *args); bool get_first_env(char **array, char **name, char **value); +void hooks_env_install(struct args *args); void img_directory_verify(const char *img_path, const struct args *args); int join_ct(int cli_ct); char *join_tag(char *cli_tag); -void parse_env(struct env_option **options, enum env_option_type opt, - const char *arg); +void parse_env(struct env_option **opts, enum env_option_type opt, char *arg); int parse_int(char *s, bool extra_ok, char *error_tag); static error_t parse_opt(int key, char *arg, struct argp_state *state); void privs_verify_invoking(); @@ -206,7 +206,7 @@ int main(int argc, char *argv[]) .container_gid = getegid(), .container_uid = geteuid(), .env_expand = true, - .hooks_prestart = list_new(sizeof(struct hook), 0); + .hooks_prestart = list_new(sizeof(struct hook), 0), .host_home = NULL, .img_ref = NULL, .ldconfigs = list_new(sizeof(char *), 0), @@ -246,8 +246,9 @@ int 
main(int argc, char *argv[]) Z_ (argp_parse(&argp, argc, argv, 0, &arg_next, &args)); if (!argp_help_fmt_set) Z_ (unsetenv("ARGP_HELP_FMT")); + logging_init(args.log_color, args.log_test); - env_hooks_install(&args); + hooks_env_install(&args); if (arg_next >= argc - 1) { printf("usage: ch-run [OPTION...] IMAGE -- COMMAND [ARG...]\n"); @@ -310,7 +311,7 @@ int main(int argc, char *argv[]) cdi_update(&args.c, args.cdi_devids); containerize(&args.c); - hooks_run(&args.c, args.c.hooks_prestart); + hooks_run(&args.c, &args.c.hooks_prestart); run_user_command(c_argv, args.initial_dir); // should never return exit(EXIT_FAILURE); } @@ -372,11 +373,12 @@ void hook_envs_def_last(struct container *c, void *d) } /* Install pre-start hooks for environment variable changes. */ -void hook_envs_install(struct args *args) +void hooks_env_install(struct args *args) { - hook_add(&args->hooks_prestart, "env-def-first", hook_envs_def_first, NULL); + hook_add(&args->c.hooks_prestart, + "env-def-first", hook_envs_def_first, NULL); - for (size_t i = 0; args->env_options[i].opt != ENV_END; i++) { + for (int i = 0; args->env_options[i].opt != ENV_END; i++) { char *name_base, *name; hookf_t *f; void *d; @@ -391,9 +393,10 @@ void hook_envs_install(struct args *args) struct env_file *ef; name_base = "env-set-gfile"; f = hook_envs_set_file; - T_ (ef = malloc(sizeof struct env_file)); - ef->name = arg; + T_ (ef = malloc(sizeof(struct env_file))); + ef->path = arg; ef->delim = delim; + ef->expand = args->c.env_expand; d = ef; } else { f = hook_envs_set; @@ -414,7 +417,10 @@ void hook_envs_install(struct args *args) case ENV_CDI_DEV: name_base = "env-set-cdi"; f = hook_envs_set; - d = cdi_envs_get(arg); + //d = cdi_envs_get(arg); + break; + case ENV_END: + T_ (false); // unreachable break; } T_ (1 <= asprintf(&name, "%s-%d", name_base, i)); @@ -422,7 +428,7 @@ void hook_envs_install(struct args *args) free(name); } - hook_add(&args->hooks_prestart, "env-def-last", hook_envs_def_last, NULL); + 
hook_add(&args->c.hooks_prestart, "env-def-last", hook_envs_def_last, NULL); } /* Validate that it’s OK to run the IMG_DIRECTORY format image at path; if @@ -703,14 +709,13 @@ static error_t parse_opt(int key, char *arg, struct argp_state *state) return 0; } -void parse_env(struct env_option **options, enum env_option_type opt, - const char *arg) +void parse_env(struct env_option **opts, enum env_option_type opt, char *arg) { struct env_option eo = (struct env_option){ .opt = opt, .arg = arg }; Te (arg == NULL || strlen(arg) > 0, "environment options: argument must have non-zero length"); - list_append((void **)env_options, &eo, sizeof(eo)); + list_append((void **)opts, &eo, sizeof(eo)); } /* Validate that the UIDs and GIDs are appropriate for program start, and diff --git a/bin/core.c b/bin/core.c index 13fbc3173..936c88cb8 100644 --- a/bin/core.c +++ b/bin/core.c @@ -401,20 +401,21 @@ void hook_add(struct hook **hook_list, const char *name, hookf_t *f, void *d) T_ (h.name = strdup(name)); h.f = f; - h.data = data; + h.data = d; - list_append(hook_list, &h, sizeof(h)); + list_append((void **)hook_list, &h, sizeof(h)); } -/* Run hooks in hook_list, passing c, then deallocate and set the pointer to - NULL. hook_list must be a member of c. */ +/* Run hooks in hook_list, passing c, then deallocate the list and set + *hook_list to NULL. hook_list must be a member of c. 
*/ void hooks_run(struct container *c, struct hook **hook_list) { - for (int i = 0; (*hook_list)[i] != NULL; i++) { - struct hook *h = (*hook_list)[i]; - VERBOSE("calling hook: %s", h->name); - h->f(c, h->data); - free(h->name); + + for (int i = 0; (*hook_list)[i].f != NULL; i++) { + struct hook h = (*hook_list)[i]; + VERBOSE("calling hook: %s", h.name); + h.f(c, h.data); + free(h.name); } free(*hook_list); diff --git a/bin/core.h b/bin/core.h index 95bbe6dc3..95ab43b11 100644 --- a/bin/core.h +++ b/bin/core.h @@ -29,7 +29,7 @@ struct hook { char *name; hookf_t *f; void *data; -} +}; enum img_type { IMG_DIRECTORY, // normal directory, perhaps an external mount of some kind @@ -63,7 +63,7 @@ struct container { /** Function prototypes **/ void containerize(struct container *c); -void hook_add(struct hook **hook_list, hookf_t *f, void *d); +void hook_add(struct hook **hook_list, const char *name, hookf_t *f, void *d); void hooks_run(struct container *c, struct hook **hook_list); enum img_type image_type(const char *ref, const char *images_dir); char *img_name2path(const char *name, const char *storage_dir); diff --git a/bin/hook.c b/bin/hook.c index c607ef18a..c120b52d1 100644 --- a/bin/hook.c +++ b/bin/hook.c @@ -1,7 +1,8 @@ /* Copyright © Triad National Security, LLC, and others. 
*/ #define _GNU_SOURCE -#pragma once +#include + #include "core.h" #include "hook.h" @@ -29,7 +30,7 @@ void hook_envs_set(struct container *c, void *d) void hook_envs_set_file(struct container *c, void *d) { struct env_file *ef = d; - struct env_var *vars = env_file_read(d->path, d->delim); + struct env_var *vars = env_file_read(ef->path, ef->delim); envs_set(vars, c->env_expand); diff --git a/bin/hook.h b/bin/hook.h index c90151a96..ce4426d9f 100644 --- a/bin/hook.h +++ b/bin/hook.h @@ -13,12 +13,13 @@ struct env_file { char *path; + char delim; bool expand; -} +}; /** Function prototypes **/ -void hook_env_set_file(struct container *c, void *d); -void hook_env_set(struct container *c, void *d); -void hook_env_unset(struct container *c, void *d); +void hook_envs_set_file(struct container *c, void *d); +void hook_envs_set(struct container *c, void *d); +void hook_envs_unset(struct container *c, void *d); diff --git a/bin/misc.c b/bin/misc.c index c1ccba605..c3786e334 100644 --- a/bin/misc.c +++ b/bin/misc.c @@ -314,43 +314,45 @@ struct env_var *env_file_read(const char *path, int delim) variables in value marked with "$" as described in the man page. */ void env_set(const char *name, const char *value, const bool expand) { - char *value_, *value_expanded; - bool first_written; + char *vwk = NULL; // modifiable copy of value // Walk through value fragments separated by colon and expand variables. 
- if (!expand) - value_expanded = value; - else { - T_ (value_ = strdup(value)); - value_expanded = ""; - first_written = false; - while (true) { // loop executes ≥ once - char *fgmt = strsep(&value_, ":"); // NULL -> no more items - if (fgmt == NULL) + if (expand) { + char *vwk_cur; // current location in vwk + char *vout = NULL; // output (expanded) string + bool first_out = false; // true after 1st output element written + T_ (vwk = strdup(value)); + vwk_cur = vwk; + while (true) { // loop executes ≥ once + char *elem = strsep(&vwk_cur, ":"); // NULL -> no more elements + if (elem == NULL) break; - if (fgmt[0] == '$' && fgmt[1] != 0) { - fgmt = getenv(fgmt + 1); // NULL if unset - if (fgmt != NULL && fgmt[0] == 0) - fgmt = NULL; // convert empty to unset + if (elem[0] == '$' && elem[1] != 0) { // looks like $VARIABLE + elem = getenv(elem + 1); // NULL if unset + if (elem != NULL && elem[0] == 0) // set but empty + elem = NULL; // convert to unset } - if (fgmt != NULL) { // NULL -> omit from output - if (first_written) - value_expanded = cat(value_expanded, ":"); - value_expanded = cat(value_expanded, fgmt); - first_written = true; + if (elem != NULL) { // empty -> omit from output list + char *vout_old = vout; + T_ (1 <= asprintf(&vout, "%s%s%s", vout_old ? vout_old : "", + !first_out ? ":" : "", elem)); + first_out = true; + free(vout_old); } } + value = vwk; } // Save results. - VERBOSE("environment: %s=%s", name, value_expanded); - Z_ (setenv(name, value_expanded, 1)); + VERBOSE("environment: %s=%s", name, value); + Z_ (setenv(name, value, 1)); + free(vwk); } void envs_set(const struct env_var *vars, const bool expand) { for (size_t i = 0; vars[i].name != NULL; i++) - env_set(env_set(vars[i].name, vars[i].value, expand)); + env_set(vars[i].name, vars[i].value, expand); } /* Remove variables matching glob from the environment. 
This is tricky, @@ -365,7 +367,7 @@ void envs_set(const struct env_var *vars, const bool expand) [1]: https://unix.stackexchange.com/a/302987 [2]: http://man7.org/linux/man-pages/man3/exec.3p.html */ -void env_unset(const char *glob) +void envs_unset(const char *glob) { char **new_environ = list_new(sizeof(char *), 0); for (size_t i = 0; environ[i] != NULL; i++) { diff --git a/bin/misc.h b/bin/misc.h index d199226a7..07e95bd20 100644 --- a/bin/misc.h +++ b/bin/misc.h @@ -128,7 +128,7 @@ int dir_ls_filter(const struct dirent *e); struct env_var *env_file_read(const char *path, int delim); void env_set(const char *name, const char *value, const bool expand); void envs_set(const struct env_var *envs, const bool expand); -void env_unset(const char *glob); +void envs_unset(const char *glob); struct env_var env_var_parse(const char *line, const char *path, size_t lineno); void list_append(void **ar, void *new, size_t size); void list_cat(void **dst, void *src, size_t size); diff --git a/bin/rootemu.h b/bin/rootemu.h deleted file mode 100644 index ed46e93a5..000000000 --- a/bin/rootemu.h +++ /dev/null @@ -1,10 +0,0 @@ -/* Copyright © Triad National Security, LLC, and others. - - This interface contains the seccomp filter for root emulation. 
*/ - -#define _GNU_SOURCE -#pragma once - -#include "core.h" - -void rootemu_init(struct container *c); diff --git a/configure.ac b/configure.ac index 9bedac152..0f5990d06 100644 --- a/configure.ac +++ b/configure.ac @@ -881,6 +881,7 @@ AS_IF([test $have_overlayfs = yes], [AC_DEFINE([HAVE_OVERLAYFS], [1], [unprivileged overlayfs])]) AS_IF([test $have_seccomp = yes], [AC_DEFINE([HAVE_SECCOMP], [1], [seccomp supported])]) +AM_CONDITIONAL([HAVE_SECCOMP], [test $have_seccomp = yes]) AS_IF([test $have_tmpfs_xattrs = yes], [AC_DEFINE([HAVE_TMPFS_XATTRS], [1], [tmpfs user xattrs])]) From 86aaf5691581e9cb6e9ce53faa5e78417bc61e15 Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky Date: Mon, 29 Jul 2024 15:15:27 -0600 Subject: [PATCH 19/29] ok it actually works? [skip ci] --- bin/ch-run.c | 1 + bin/core.c | 1 - bin/hook.c | 6 ++---- bin/misc.c | 11 +++++++++++ bin/misc.h | 1 + 5 files changed, 15 insertions(+), 5 deletions(-) diff --git a/bin/ch-run.c b/bin/ch-run.c index 5e207783d..2f0001cb8 100644 --- a/bin/ch-run.c +++ b/bin/ch-run.c @@ -406,6 +406,7 @@ void hooks_env_install(struct args *args) } else { // direct set name_base = "env-set-direct"; d = list_new(sizeof(struct env_var), 1); + ((struct env_var *)d)[0] = env_var_parse(arg, NULL, 0); } } break; diff --git a/bin/core.c b/bin/core.c index 936c88cb8..895d7114d 100644 --- a/bin/core.c +++ b/bin/core.c @@ -410,7 +410,6 @@ void hook_add(struct hook **hook_list, const char *name, hookf_t *f, void *d) *hook_list to NULL. hook_list must be a member of c. 
*/ void hooks_run(struct container *c, struct hook **hook_list) { - for (int i = 0; (*hook_list)[i].f != NULL; i++) { struct hook h = (*hook_list)[i]; VERBOSE("calling hook: %s", h.name); diff --git a/bin/hook.c b/bin/hook.c index c120b52d1..1966e222a 100644 --- a/bin/hook.c +++ b/bin/hook.c @@ -20,8 +20,7 @@ void hook_envs_set(struct container *c, void *d) struct env_var *vars = d; envs_set(vars, c->env_expand); - - free(vars); + envs_free(&vars); } /* Set the environment variables specified in file d, then free d. NOTE: @@ -33,8 +32,7 @@ void hook_envs_set_file(struct container *c, void *d) struct env_var *vars = env_file_read(ef->path, ef->delim); envs_set(vars, c->env_expand); - - free(vars); + envs_free(&vars); free(ef); } diff --git a/bin/misc.c b/bin/misc.c index c3786e334..873997d0c 100644 --- a/bin/misc.c +++ b/bin/misc.c @@ -349,6 +349,17 @@ void env_set(const char *name, const char *value, const bool expand) free(vwk); } +/* Free the environment variables list *vars, both the individual buffers within + as well as the whole list, then set *vars to NULL.
*/ +void envs_free(struct env_var **vars) +{ + for (int i = 0; (*vars)[i].name != NULL; i++) + free((*vars)[i].name); // .value points into same buffer; see split() + + free(*vars); + *vars = NULL; +} + void envs_set(const struct env_var *vars, const bool expand) { for (size_t i = 0; vars[i].name != NULL; i++) diff --git a/bin/misc.h b/bin/misc.h index 07e95bd20..7987a210a 100644 --- a/bin/misc.h +++ b/bin/misc.h @@ -127,6 +127,7 @@ int dir_ls_count(const char *path); int dir_ls_filter(const struct dirent *e); struct env_var *env_file_read(const char *path, int delim); void env_set(const char *name, const char *value, const bool expand); +void envs_free(struct env_var **vars); void envs_set(const struct env_var *envs, const bool expand); void envs_unset(const char *glob); struct env_var env_var_parse(const char *line, const char *path, size_t lineno); From c4adb2e76c115d7495a1c638db8a0408ceb52020 Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky Date: Wed, 31 Jul 2024 13:48:19 -0600 Subject: [PATCH 20/29] move seccomp into its own file and be a hook [skip ci] --- bin/Makefile.am | 6 +- bin/ch-run.c | 52 +++-- bin/core.c | 589 +++++++++++++++--------------------------------- bin/core.h | 9 +- bin/json.c | 80 +++---- bin/json.h | 2 +- bin/misc.c | 23 +- bin/misc.h | 1 + bin/seccomp.c | 258 +++++++++++++++++++++ bin/seccomp.h | 2 +- 10 files changed, 540 insertions(+), 482 deletions(-) create mode 100644 bin/seccomp.c diff --git a/bin/Makefile.am b/bin/Makefile.am index 416022b29..ac34038fa 100644 --- a/bin/Makefile.am +++ b/bin/Makefile.am @@ -15,9 +15,9 @@ endif if HAVE_LIBSQUASHFUSE ch_run_SOURCES += fuse.h fuse.c endif -#if HAVE_SECCOMP -#ch_run_SOURCES += seccomp.h seccomp.c -#endif +if HAVE_SECCOMP +ch_run_SOURCES += seccomp.h seccomp.c +endif # additional build flags for ch-run ch_run_CFLAGS = $(PTHREAD_CFLAGS) diff --git a/bin/ch-run.c b/bin/ch-run.c index 2f0001cb8..569f3ee6c 100644 --- a/bin/ch-run.c +++ b/bin/ch-run.c @@ -19,6 +19,9 @@ #include "json.h" 
#endif #include "misc.h" +#ifdef HAVE_SECCOMP +#include "seccomp.h" +#endif /** Types **/ @@ -228,9 +231,6 @@ int main(int argc, char *argv[]) .initial_dir = NULL, .log_color = LL_COLOR_AUTO, .log_test = LL_TEST_NONE, -#ifdef HAVE_SECCOMP - .seccomp_p = false, -#endif .storage_dir = storage_default(), .unsafe = false }; @@ -248,7 +248,6 @@ int main(int argc, char *argv[]) Z_ (unsetenv("ARGP_HELP_FMT")); logging_init(args.log_color, args.log_test); - hooks_env_install(&args); if (arg_next >= argc - 1) { printf("usage: ch-run [OPTION...] IMAGE -- COMMAND [ARG...]\n"); @@ -286,15 +285,15 @@ int main(int argc, char *argv[]) args.c.join_tag = join_tag(args.c.join_tag); } + c_argv = list_new(sizeof(char *), argc - arg_next); + for (int i = 0; i < argc - arg_next; i++) + c_argv[i] = argv[i + arg_next]; + if (getenv("TMPDIR") != NULL) host_tmp = getenv("TMPDIR"); else host_tmp = "/tmp"; - c_argv = list_new(sizeof(char *), argc - arg_next); - for (int i = 0; i < argc - arg_next; i++) - c_argv[i] = argv[i + arg_next]; - VERBOSE("verbosity: %d", verbose); VERBOSE("image: %s", args.c.img_ref); VERBOSE("storage: %s", args.storage_dir); @@ -305,13 +304,16 @@ int main(int argc, char *argv[]) args.c.join_pid); VERBOSE("private /tmp: %d", args.c.private_tmp); #ifdef HAVE_SECCOMP - VERBOSE("seccomp: %d", args.seccomp_p); + VERBOSE("seccomp: %s", bool_to_string(args.seccomp_p)); #endif - VERBOSE("unsafe: %d", args.unsafe); + VERBOSE("unsafe: %s", bool_to_string(args.unsafe)); + +#ifdef HAVE_JSON + cdi_init(&args.c, args.cdi_devids); +#endif + hooks_env_install(&args); - cdi_update(&args.c, args.cdi_devids); containerize(&args.c); - hooks_run(&args.c, &args.c.hooks_prestart); run_user_command(c_argv, args.initial_dir); // should never return exit(EXIT_FAILURE); } @@ -375,11 +377,11 @@ void hook_envs_def_last(struct container *c, void *d) /* Install pre-start hooks for environment variable changes. 
*/ void hooks_env_install(struct args *args) { - hook_add(&args->c.hooks_prestart, + hook_add(&args->c.hooks_prestart, HOOK_DUP_FAIL, "env-def-first", hook_envs_def_first, NULL); for (int i = 0; args->env_options[i].opt != ENV_END; i++) { - char *name_base, *name; + char *name; hookf_t *f; void *d; enum env_option_type opt = args->env_options[i].opt; @@ -391,7 +393,7 @@ void hooks_env_install(struct args *args) int delim = ENV_SET ? '\n' : '\0'; if (args == NULL) { // guest path; defer file read struct env_file *ef; - name_base = "env-set-gfile"; + name = "env-set-gfile"; f = hook_envs_set_file; T_ (ef = malloc(sizeof(struct env_file))); ef->path = arg; @@ -401,22 +403,22 @@ void hooks_env_install(struct args *args) } else { f = hook_envs_set; if (strchr(arg, '=') == NULL) { // host path; read file now - name_base = "env-set-hfile"; + name = "env-set-hfile"; d = env_file_read(arg, delim); } else { // direct set - name_base = "env-set-direct"; + name = "env-set-direct"; d = list_new(sizeof(struct env_var), 1); ((struct env_var *)d)[0] = env_var_parse(arg, NULL, 0); } } break; case ENV_UNSET: - name_base = "env-unset"; + name = "env-unset"; f = hook_envs_unset; d = arg; break; case ENV_CDI_DEV: - name_base = "env-set-cdi"; + name = "env-set-cdi"; f = hook_envs_set; //d = cdi_envs_get(arg); break; @@ -424,12 +426,11 @@ void hooks_env_install(struct args *args) T_ (false); // unreachable break; } - T_ (1 <= asprintf(&name, "%s-%d", name_base, i)); - hook_add(&args->c.hooks_prestart, name, f, d); - free(name); + hook_add(&args->c.hooks_prestart, HOOK_DUP_OK, name, f, d); } - hook_add(&args->c.hooks_prestart, "env-def-last", hook_envs_def_last, NULL); + hook_add(&args->c.hooks_prestart, HOOK_DUP_FAIL, + "env-def-last", hook_envs_def_last, NULL); } /* Validate that it’s OK to run the IMG_DIRECTORY format image at path; if @@ -588,7 +589,8 @@ static error_t parse_opt(int key, char *arg, struct argp_state *state) break; #ifdef HAVE_SECCOMP case -14: // --seccomp - 
args->seccomp_p = true; + hook_add(&args->c.hooks_prestart, HOOK_DUP_SKIP, + "seccomp", hook_seccomp_install, NULL); break; #endif case -15: // --set-env0 @@ -705,7 +707,7 @@ static error_t parse_opt(int key, char *arg, struct argp_state *state) exit(EXIT_FAILURE); default: return ARGP_ERR_UNKNOWN; - }; + } return 0; } diff --git a/bin/core.c b/bin/core.c index 895d7114d..023ca0591 100644 --- a/bin/core.c +++ b/bin/core.c @@ -3,22 +3,12 @@ #define _GNU_SOURCE #include "config.h" -#include #include -#include -#ifdef HAVE_SECCOMP -#include -#include -#include -#endif #include #include #include +#include #include -#ifdef HAVE_SECCOMP -#include -#include -#endif #include #include #include @@ -88,92 +78,6 @@ struct bind BINDS_DEFAULT[] = { { 0 } }; -/* Special values for seccomp tables. These must be negative to avoid clashing - with real syscall numbers (note zero is often a valid syscal number). */ -#define NR_NON -1 // syscall does not exist on architecture -#define NR_END -2 // end of table - -/* Architectures that we support for seccomp. Order matches the - corresponding table below. - - Note: On some distros (e.g., CentOS 7), some of the architecture numbers - are missing. The workaround is to use the numbers I have on Debian - Bullseye. The reason I (Reid) feel moderately comfortable doing this is how - militant Linux is about not changing the userspace API. */ -#ifdef HAVE_SECCOMP -#ifndef AUDIT_ARCH_AARCH64 -#define AUDIT_ARCH_AARCH64 0xC00000B7u // undeclared on CentOS 7 -#undef AUDIT_ARCH_ARM // uses undeclared EM_ARM on CentOS 7 -#define AUDIT_ARCH_ARM 0x40000028u -#endif -int SECCOMP_ARCHS[] = { AUDIT_ARCH_AARCH64, // arm64 - AUDIT_ARCH_ARM, // arm32 - AUDIT_ARCH_I386, // x86 (32-bit) - AUDIT_ARCH_PPC64LE, // PPC - AUDIT_ARCH_S390X, // s390x - AUDIT_ARCH_X86_64, // x86-64 - NR_END }; -#endif - -/* System call numbers that we fake with seccomp (by doing nothing and - returning success). 
Some processors can execute multiple architectures - (e.g., 64-bit Intel CPUs can run both x64-64 and x86 code), and a process’ - architecture can even change (if you execve(2) binary of different - architecture), so we can’t just use the build host’s architecture. - - I haven’t figured out how to gather these system call numbers - automatically, so they are compiled from [1, 2, 3]. See also [4] for a more - general reference. - - NOTE: The total number of faked syscalls (i.e., non-zero entries below) - must be somewhat less than 256. I haven’t computed the exact limit. There - will be an assertion failure at runtime if this is exceeded. - - WARNING: Keep this list consistent with the ch-image(1) man page! - - [1]: https://chromium.googlesource.com/chromiumos/docs/+/HEAD/constants/syscalls.md#Cross_arch-Numbers - [2]: https://github.com/strace/strace/blob/v4.26/linux/powerpc64/syscallent.h - [3]: https://github.com/strace/strace/blob/v6.6/src/linux/s390x/syscallent.h - [4]: https://unix.stackexchange.com/questions/421750 */ -#ifdef HAVE_SECCOMP -int FAKE_SYSCALL_NRS[][6] = { - // arm64 arm32 x86 PPC64 s390x x86-64 - // ------ ------ ------ ------ ------ ------ - { 91, 185, 185, 184, 185, 126 }, // capset - { NR_NON, 182, 182, 181, 212, 92 }, // chown - { NR_NON, 212, 212, NR_NON, NR_NON, NR_NON }, // chown32 - { 55, 95, 95, 95, 207, 93 }, // fchown - { NR_NON, 207, 207, NR_NON, NR_NON, NR_NON }, // fchown32 - { 54, 325, 298, 289, 291, 260 }, // fchownat - { NR_NON, 16, 16, 16, 198, 94 }, // lchown - { NR_NON, 198, 198, NR_NON, NR_NON, NR_NON }, // lchown32 - { 104, 347, 283, 268, 277, 246 }, // kexec_load - { 152, 139, 139, 139, 216, 123 }, // setfsgid - { NR_NON, 216, 216, NR_NON, NR_NON, NR_NON }, // setfsgid32 - { 151, 138, 138, 138, 215, 122 }, // setfsuid - { NR_NON, 215, 215, NR_NON, NR_NON, NR_NON }, // setfsuid32 - { 144, 46, 46, 46, 214, 106 }, // setgid - { NR_NON, 214, 214, NR_NON, NR_NON, NR_NON }, // setgid32 - { 159, 81, 81, 81, 206, 116 }, // 
setgroups - { NR_NON, 206, 206, NR_NON, NR_NON, NR_NON }, // setgroups32 - { 143, 71, 71, 71, 204, 114 }, // setregid - { NR_NON, 204, 204, NR_NON, NR_NON, NR_NON }, // setregid32 - { 149, 170, 170, 169, 210, 119 }, // setresgid - { NR_NON, 210, 210, NR_NON, NR_NON, NR_NON }, // setresgid32 - { 147, 164, 164, 164, 208, 117 }, // setresuid - { NR_NON, 208, 208, NR_NON, NR_NON, NR_NON }, // setresuid32 - { 145, 70, 70, 70, 203, 113 }, // setreuid - { NR_NON, 203, 203, NR_NON, NR_NON, NR_NON }, // setreuid32 - { 146, 23, 23, 23, 213, 105 }, // setuid - { NR_NON, 213, 213, NR_NON, NR_NON, NR_NON }, // setuid32 - { NR_END }, // end -}; -int FAKE_MKNOD_NRS[] = - { NR_NON, 14, 14, 14, 14, 133 }; -int FAKE_MKNODAT_NRS[] = - { 33, 324, 297, 288, 290, 259 }; -#endif - /** Global variables **/ @@ -199,19 +103,16 @@ void bind_mount(const char *src, const char *dst, enum bind_dep, const char *newroot, unsigned long flags, const char *scratch); void bind_mounts(const struct bind *binds, const char *newroot, unsigned long flags, const char * scratch); -void enter_udss(struct container *c); -#ifdef HAVE_SECCOMP -void iw(struct sock_fprog *p, int i, - uint16_t op, uint32_t k, uint8_t jt, uint8_t jf); -#endif void join_begin(const char *join_tag); -void join_namespace(pid_t pid, const char *ns); -void join_namespaces(pid_t pid); void join_end(int join_ct); -void sem_timedwait_relative(sem_t *sem, int timeout); -void setup_namespaces(const struct container *c, uid_t uid_out, uid_t uid_in, +void mounts_setup(struct container *c); +void namespace_join(pid_t pid, const char *ns); +void namespaces_join(pid_t pid); +void namespaces_setup(const struct container *c, uid_t uid_out, uid_t uid_in, gid_t gid_out, gid_t gid_in); -void setup_passwd(const struct container *c); +void passwd_setup(const struct container *c); +void pivot(struct container *c); +void sem_timedwait_relative(sem_t *sem, int timeout); void tmpfs_mount(const char *dst, const char *newroot, const char *data); @@ -268,7 
+169,7 @@ void bind_mounts(const struct bind *binds, const char *newroot, void containerize(struct container *c) { if (c->join_pid) { - join_namespaces(c->join_pid); + namespaces_join(c->join_pid); return; } if (c->join) @@ -278,127 +179,58 @@ void containerize(struct container *c) // fusermount3 non-setuid, and the inner so we get the desired UID // within the container. We do this even if the image is a directory, to // reduce the number of code paths. - setup_namespaces(c, geteuid(), 0, getegid(), 0); + namespaces_setup(c, geteuid(), 0, getegid(), 0); #ifdef HAVE_LIBSQUASHFUSE if (c->type == IMG_SQUASH) sq_fork(c); #endif - setup_namespaces(c, 0, c->container_uid, 0, c->container_gid); - enter_udss(c); + namespaces_setup(c, 0, c->container_uid, 0, c->container_gid); + mounts_setup(c); + VERBOSE("prestart hooks: %d", list_count(c->hooks_prestart, + sizeof(struct hook))); + hooks_run(c, &c->hooks_prestart); + pivot(c); } else - join_namespaces(join.shared->winner_pid); + namespaces_join(join.shared->winner_pid); if (c->join) join_end(c->join_ct); } -/* Enter the new root (UDSS). On entry, the namespaces are set up, and this - does the mounting and filesystem setup. - - Note that pivot_root(2) requires a complex dance to work, i.e., to avoid - multiple undocumented error conditions. This dance is explained in detail - in bin/ch-checkns.c. */ -void enter_udss(struct container *c) -{ - char *nr_parent, *nr_base, *mkdir_scratch; - - LOG_IDS; - mkdir_scratch = NULL; - path_split(c->newroot, &nr_parent, &nr_base); +/* Append hook function f to hook_list. When called, the hook will be passed + d; this lets hooks receive arbitrary arguments (i.e., it’s a poor person’s + closure). hook_list must be a member of c. - // Claim new root for this namespace. Despite MS_REC in bind_mount(), we do - // need both calls to avoid pivot_root(2) failing with EBUSY later. 
- DEBUG("claiming new root for this namespace") - bind_mount(c->newroot, c->newroot, BD_REQUIRED, "/", MS_PRIVATE, NULL); - bind_mount(nr_parent, nr_parent, BD_REQUIRED, "/", MS_PRIVATE, NULL); - // Re-mount new root read-only unless --write or already read-only. - if (!c->writable && !(access(c->newroot, W_OK) == -1 && errno == EROFS)) { - unsigned long flags = path_mount_flags(c->newroot) - | MS_REMOUNT // Re-mount ... - | MS_BIND // only this mount point ... - | MS_RDONLY; // read-only. - Z_ (mount(NULL, c->newroot, NULL, flags, NULL)); - } - // Overlay a tmpfs if --write-fake. See for useful details: - // https://www.kernel.org/doc/html/v5.11/filesystems/tmpfs.html - // https://www.kernel.org/doc/html/v5.11/filesystems/overlayfs.html - if (c->overlay_size != NULL) { - char *options; - struct stat st; - VERBOSE("overlaying tmpfs for --write-fake (%s)", c->overlay_size); - T_ (1 <= asprintf(&options, "size=%s", c->overlay_size)); - Zf (mount(NULL, WF_MNT, "tmpfs", 0, options), - "cannot mount tmpfs for overlay"); - free(options); - Z_ (mkdir(WF_MNT "/upper", 0700)); - Z_ (mkdir(WF_MNT "/work", 0700)); - Z_ (mkdir(WF_MNT "/merged", 0700)); - mkdir_scratch = WF_MNT "/mkdir_overmount"; - Z_ (mkdir(mkdir_scratch, 0700)); - T_ (1 <= asprintf(&options, ("lowerdir=%s,upperdir=%s,workdir=%s," - "index=on,userxattr,volatile"), - c->newroot, WF_MNT "/upper", WF_MNT "/work")); - // update newroot - Zf (stat(c->newroot, &st), - "can't stat new root; overmounted by tmpfs for -W?: %s", c->newroot); - c->newroot = WF_MNT "/merged"; - free(nr_parent); - free(nr_base); - path_split(c->newroot, &nr_parent, &nr_base); - Zf (mount(NULL, c->newroot, "overlay", 0, options), - "can't overlay: %s, %s", c->newroot, options); - VERBOSE("newroot updated: %s", c->newroot); - free(options); - } - DEBUG("starting bind-mounts"); - // Bind-mount default files and directories. - bind_mounts(BINDS_DEFAULT, c->newroot, MS_RDONLY, NULL); - // /etc/passwd and /etc/group. 
- if (!c->private_passwd) - setup_passwd(c); - // Container /tmp. - if (c->private_tmp) { - tmpfs_mount("/tmp", c->newroot, NULL); - } else { - bind_mount(host_tmp, "/tmp", BD_REQUIRED, c->newroot, 0, NULL); - } - // Bind-mount user’s home directory at /home/$USER if requested. - if (c->host_home) { - T_ (c->overlay_size != NULL); - bind_mount(c->host_home, cat("/home/", username), - BD_MAKE_DST, c->newroot, 0, mkdir_scratch); - } - // Bind-mount user-specified directories. - bind_mounts(c->binds, c->newroot, 0, mkdir_scratch); - // Overmount / to avoid EINVAL if it’s a rootfs. - Z_ (chdir(nr_parent)); - Z_ (mount(nr_parent, "/", NULL, MS_MOVE, NULL)); - Z_ (chroot(".")); - // Pivot into the new root. Use /dev because it’s available even in - // extremely minimal images. - c->newroot = cat("/", nr_base); - Zf (chdir(c->newroot), "can't chdir into new root"); - Zf (syscall(SYS_pivot_root, c->newroot, path_join(c->newroot, "dev")), - "can't pivot_root(2)"); - Zf (chroot("."), "can't chroot(2) into new root"); - Zf (umount2("/dev", MNT_DETACH), "can't umount old root"); - DEBUG("pivot_root(2) dance successful") -} + “dup” says what to do if a hook with the same name is already in the list: -/* Append hook function f to hook_list. When called, it will be passed d; this - lets hooks receive arbitrary arguments (i.e., it’s a poor person’s - closure). hook_list must be a member of c. + HOOK_DUP_OK add the hook anyway + HOOK_DUP_SKIP silently do nothing (i.e., don’t add the hook) + HOOK_DUP_FAIL fatal error Warning: The hook framework does no memory management for name or d, i.e., if name needs to be freed, that is the responsibility of the caller (this function uses a copy), and/or if anything in d either needs to be freed, that is the responsibility of the hook. 
*/ -void hook_add(struct hook **hook_list, const char *name, hookf_t *f, void *d) +void hook_add(struct hook **hook_list, enum hook_dup dup, + const char *name, hookf_t *f, void *d) { // FIXME: hooks: environment variables, seccomp, CDI struct hook h; + if (dup == HOOK_DUP_SKIP || dup == HOOK_DUP_FAIL) { + bool dup_found = false; + for (int i = 0; (*hook_list)[i].name != NULL; i++) + if (!strcmp((*hook_list)[i].name, name)) { + dup_found = true; + break; + } + if (dup_found) { + Te (dup == HOOK_DUP_SKIP, "invalid duplicate hook: %s", name); + return; // skip adding hook + } + } + T_ (h.name = strdup(name)); h.f = f; h.data = d; @@ -410,9 +242,10 @@ void hook_add(struct hook **hook_list, const char *name, hookf_t *f, void *d) *hook_list to NULL. hook_list must be a member of c. */ void hooks_run(struct container *c, struct hook **hook_list) { + int hook_ct = list_count(*hook_list, sizeof((*hook_list)[0])); for (int i = 0; (*hook_list)[i].f != NULL; i++) { struct hook h = (*hook_list)[i]; - VERBOSE("calling hook: %s", h.name); + DEBUG("calling hook %d/%d: %s", i+1, hook_ct, h.name); h.f(c, h.data); free(h.name); } @@ -474,16 +307,6 @@ char *img_name2path(const char *name, const char *storage_dir) return path; } -/* Helper function to write seccomp-bpf programs. */ -#ifdef HAVE_SECCOMP -void iw(struct sock_fprog *p, int i, - uint16_t op, uint32_t k, uint8_t jt, uint8_t jf) -{ - p->filter[i] = (struct sock_filter){ op, jt, jf, k }; - DEBUG("%4d: { op=%2x k=%8x jt=%3d jf=%3d }", i, op, k, jt, jf); -} -#endif - /* Begin coordinated section of namespace joining. */ void join_begin(const char *join_tag) { @@ -551,8 +374,83 @@ void join_end(int join_ct) VERBOSE("join: done"); } +/* Set up the container filesystem tree. Namespaces must already be done. 
*/ +void mounts_setup(struct container *c) +{ + char *nr_parent, *mkdir_scratch; + + VERBOSE("creating container filesystem tree"); + LOG_IDS; + mkdir_scratch = NULL; + path_split(c->newroot, &nr_parent, NULL); + + // Claim new root for this namespace. Despite MS_REC in bind_mount(), we do + // need both calls to avoid pivot_root(2) failing with EBUSY later. + DEBUG("claiming new root for this namespace") + bind_mount(c->newroot, c->newroot, BD_REQUIRED, "/", MS_PRIVATE, NULL); + bind_mount(nr_parent, nr_parent, BD_REQUIRED, "/", MS_PRIVATE, NULL); + // Re-mount new root read-only unless --write or already read-only. + if (!c->writable && !(access(c->newroot, W_OK) == -1 && errno == EROFS)) { + unsigned long flags = path_mount_flags(c->newroot) + | MS_REMOUNT // Re-mount ... + | MS_BIND // only this mount point ... + | MS_RDONLY; // read-only. + Z_ (mount(NULL, c->newroot, NULL, flags, NULL)); + } + // Overlay a tmpfs if --write-fake. See for useful details: + // https://www.kernel.org/doc/html/v5.11/filesystems/tmpfs.html + // https://www.kernel.org/doc/html/v5.11/filesystems/overlayfs.html + if (c->overlay_size != NULL) { + char *options; + struct stat st; + VERBOSE("overlaying tmpfs for --write-fake (%s)", c->overlay_size); + T_ (1 <= asprintf(&options, "size=%s", c->overlay_size)); + Zf (mount(NULL, WF_MNT, "tmpfs", 0, options), + "cannot mount tmpfs for overlay"); + free(options); + Z_ (mkdir(WF_MNT "/upper", 0700)); + Z_ (mkdir(WF_MNT "/work", 0700)); + Z_ (mkdir(WF_MNT "/merged", 0700)); + mkdir_scratch = WF_MNT "/mkdir_overmount"; + Z_ (mkdir(mkdir_scratch, 0700)); + T_ (1 <= asprintf(&options, ("lowerdir=%s,upperdir=%s,workdir=%s," + "index=on,userxattr,volatile"), + c->newroot, WF_MNT "/upper", WF_MNT "/work")); + // update newroot + Zf (stat(c->newroot, &st), + "can't stat new root; overmounted by tmpfs for -W?: %s", c->newroot); + c->newroot = WF_MNT "/merged"; + Zf (mount(NULL, c->newroot, "overlay", 0, options), + "can't overlay: %s, %s", c->newroot, 
options); + VERBOSE("newroot updated: %s", c->newroot); + free(options); + } + DEBUG("starting bind-mounts"); + // Bind-mount default files and directories. + bind_mounts(BINDS_DEFAULT, c->newroot, MS_RDONLY, NULL); + // /etc/passwd and /etc/group. + if (!c->private_passwd) + passwd_setup(c); + // Container /tmp. + if (c->private_tmp) { + tmpfs_mount("/tmp", c->newroot, NULL); + } else { + bind_mount(host_tmp, "/tmp", BD_REQUIRED, c->newroot, 0, NULL); + } + // Bind-mount user’s home directory at /home/$USER if requested. + if (c->host_home) { + T_ (c->overlay_size != NULL); + bind_mount(c->host_home, cat("/home/", username), + BD_MAKE_DST, c->newroot, 0, mkdir_scratch); + } + // Bind-mount user-specified directories. + bind_mounts(c->binds, c->newroot, 0, mkdir_scratch); + + free(nr_parent); +} + /* Join a specific namespace. */ -void join_namespace(pid_t pid, const char *ns) +void namespace_join(pid_t pid, const char *ns) { char *path; int fd; @@ -579,197 +477,23 @@ void join_namespace(pid_t pid, const char *ns) } } -/* Join the existing namespaces created by the join winner. */ -void join_namespaces(pid_t pid) +/* Join the existing namespaces containing process pid, which could be the + join winner or another process. */ +void namespaces_join(pid_t pid) { VERBOSE("joining namespaces of pid %d", pid); - join_namespace(pid, "user"); - join_namespace(pid, "mnt"); -} - -/* Replace the current process with user command and arguments. 
*/ -void run_user_command(char *argv[], const char *initial_dir) -{ - LOG_IDS; - - if (initial_dir != NULL) - Zf (chdir(initial_dir), "can't cd to %s", initial_dir); - - VERBOSE("executing: %s", argv_to_string(argv)); - - Zf (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0), "can't set no_new_privs"); - if (verbose < LL_INFO) - T_ (freopen("/dev/null", "w", stdout)); - if (verbose < LL_STDERR) - T_ (freopen("/dev/null", "w", stderr)); - execvp(argv[0], argv); // only returns if error - Tf (0, "can't execve(2): %s", argv[0]); -} - -/* Set up the fake-syscall seccomp(2) filter. This computes and installs a - long-ish but fairly simple BPF program to implement the filter. To - understand this rather hairy language: - - 1. https://man7.org/training/download/secisol_seccomp_slides.pdf - 2. https://www.kernel.org/doc/html/latest/userspace-api/seccomp_filter.html - 3. https://elixir.bootlin.com/linux/latest/source/samples/seccomp */ -#ifdef HAVE_SECCOMP -void seccomp_install(void) -{ - int arch_ct = sizeof(SECCOMP_ARCHS)/sizeof(SECCOMP_ARCHS[0]) - 1; - int syscall_cts[arch_ct]; - struct sock_fprog p = { 0 }; - int ii, idx_allow, idx_fake, idx_mknod, idx_mknodat, idx_next_arch; - // Lengths of certain instruction groups. These are all obtained manually - // by counting below, violating DRY. We could automate these counts, but it - // seemed like the cost of extra buffers and code to do that would exceed - // that of maintaining the manual counts. - int ct_jump_start = 4; // ld arch & syscall nr, arch test, end-of-arch jump - int ct_mknod_jump = 2; // jump table handling for mknod(2) and mknodat(2) - int ct_mknod = 2; // mknod(2) handling - int ct_mknodat = 6; // mknodat(2) handling - - // Count how many syscalls we are going to fake in the standard way. We - // need this to compute the right offsets for all the jumps. 
- for (int ai = 0; SECCOMP_ARCHS[ai] != NR_END; ai++) { - p.len += ct_jump_start + ct_mknod_jump; - syscall_cts[ai] = 0; - for (int si = 0; FAKE_SYSCALL_NRS[si][0] != NR_END; si++) { - bool syscall_p = FAKE_SYSCALL_NRS[si][ai] != NR_NON; - syscall_cts[ai] += syscall_p; - p.len += syscall_p; // syscall jump table entry - } - DEBUG("seccomp: arch %x: found %d syscalls", - SECCOMP_ARCHS[ai], syscall_cts[ai]); - } - - // Initialize program buffer. - p.len += ( 1 // return allow - + 1 // return fake success - + ct_mknod // mknod(2) handling - + ct_mknodat); // mknodat(2) handling - DEBUG("seccomp(2) program has %d instructions", p.len); - T_ (p.filter = calloc(p.len, sizeof(struct sock_filter))); - - // Return call addresses. Allow needs to come first because we’ll jump to - // it for unknown architectures. - idx_allow = p.len - 2 - ct_mknod - ct_mknodat; - idx_fake = p.len - 1 - ct_mknod - ct_mknodat; - idx_mknod = p.len - ct_mknod - ct_mknodat; - idx_mknodat = p.len - ct_mknodat; - - // Build a jump table for each architecture. The gist is: if architecture - // matches, fall through into the jump table, otherwise jump to the next - // architecture (or ALLOW for the last architecture). 
- ii = 0; - idx_next_arch = -1; // avoid warning on some compilers - for (int ai = 0; SECCOMP_ARCHS[ai] != NR_END; ai++) { - int jump; - idx_next_arch = ii + syscall_cts[ai] + ct_jump_start + ct_mknod_jump; - // load arch into accumulator - iw(&p, ii++, BPF_LD|BPF_W|BPF_ABS, - offsetof(struct seccomp_data, arch), 0, 0); - // jump to next arch if arch doesn't match - jump = idx_next_arch - ii - 1; - T_ (jump <= 255); - iw(&p, ii++, BPF_JMP|BPF_JEQ|BPF_K, SECCOMP_ARCHS[ai], 0, jump); - // load syscall number into accumulator - iw(&p, ii++, BPF_LD|BPF_W|BPF_ABS, - offsetof(struct seccomp_data, nr), 0, 0); - // jump table of syscalls - for (int si = 0; FAKE_SYSCALL_NRS[si][0] != NR_END; si++) { - int nr = FAKE_SYSCALL_NRS[si][ai]; - if (nr != NR_NON) { - jump = idx_fake - ii - 1; - T_ (jump <= 255); - iw(&p, ii++, BPF_JMP|BPF_JEQ|BPF_K, nr, jump, 0); - } - } - // jump to mknod(2) handling (add even if syscall not implemented to - // make the instruction counts simpler) - jump = idx_mknod - ii - 1; - T_ (jump <= 255); - iw(&p, ii++, BPF_JMP|BPF_JEQ|BPF_K, FAKE_MKNOD_NRS[ai], jump, 0); - // jump to mknodat(2) handling - jump = idx_mknodat - ii - 1; - T_ (jump <= 255); - iw(&p, ii++, BPF_JMP|BPF_JEQ|BPF_K, FAKE_MKNODAT_NRS[ai], jump, 0); - // unfiltered syscall, jump to allow (limit of 255 doesn’t apply to JA) - jump = idx_allow - ii - 1; - iw(&p, ii++, BPF_JMP|BPF_JA, jump, 0, 0); - } - T_ (idx_next_arch == idx_allow); - - // Returns. (Note that if we wanted a non-zero errno, we’d bitwise-or with - // SECCOMP_RET_ERRNO. But because fake success is errno == 0, we don’t need - // a no-op “| 0”.) - T_ (ii == idx_allow); - iw(&p, ii++, BPF_RET|BPF_K, SECCOMP_RET_ALLOW, 0, 0); - T_ (ii == idx_fake); - iw(&p, ii++, BPF_RET|BPF_K, SECCOMP_RET_ERRNO, 0, 0); - - // mknod(2) handling. This just loads the file mode and jumps to the right - // place in the mknodat(2) handling. 
- T_ (ii == idx_mknod); - // load mode argument into accumulator - iw(&p, ii++, BPF_LD|BPF_W|BPF_ABS, - offsetof(struct seccomp_data, args[1]), 0, 0); - // jump to mode test - iw(&p, ii++, BPF_JMP|BPF_JA, 1, 0, 0); - - // mknodat(2) handling. - T_ (ii == idx_mknodat); - // load mode argument into accumulator - iw(&p, ii++, BPF_LD|BPF_W|BPF_ABS, - offsetof(struct seccomp_data, args[2]), 0, 0); - // jump to fake return if trying to create a device. - iw(&p, ii++, BPF_ALU|BPF_AND|BPF_K, S_IFMT, 0, 0); // file type only - iw(&p, ii++, BPF_JMP|BPF_JEQ|BPF_K, S_IFCHR, 2, 0); - iw(&p, ii++, BPF_JMP|BPF_JEQ|BPF_K, S_IFBLK, 1, 0); - // returns - iw(&p, ii++, BPF_RET|BPF_K, SECCOMP_RET_ALLOW, 0, 0); - iw(&p, ii++, BPF_RET|BPF_K, SECCOMP_RET_ERRNO, 0, 0); - - // Install filter. Use prctl(2) rather than seccomp(2) for slightly greater - // compatibility (Linux 3.5 rather than 3.17) and because there is a glibc - // wrapper. - T_ (ii == p.len); // next instruction now one past the end of the buffer - Z_ (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &p)); - DEBUG("note: see FAQ to disassemble the above") - - // Test filter. This will fail if the kernel executes the call (because we - // are not really privileged and the arguments are bogus) or succeed if - // filter handles it. We selected it over something more naturally in the - // filter, e.g. setuid(2), because (1) no container process should ever use - // it and (2) it’s unlikely to be emulated by a smarter filter in the - // future, i.e., it won’t silently start doing something. - Zf (syscall(SYS_kexec_load, 0, 0, NULL, 0), - "seccomp root emulation failed (is your architecture supported?)"); -} -#endif - -/* Wait for semaphore sem for up to timeout seconds. If timeout or an error, - exit unsuccessfully. */ -void sem_timedwait_relative(sem_t *sem, int timeout) -{ - struct timespec deadline; - - // sem_timedwait() requires a deadline rather than a timeout. 
- Z_ (clock_gettime(CLOCK_REALTIME, &deadline)); - deadline.tv_sec += timeout; - - if (sem_timedwait(sem, &deadline)) { - Ze (errno == ETIMEDOUT, "timeout waiting for join lock"); - Tf (0, "failure waiting for join lock"); - } + namespace_join(pid, "user"); + namespace_join(pid, "mnt"); } /* Activate the desired isolation namespaces. */ -void setup_namespaces(const struct container *c, uid_t uid_out, uid_t uid_in, +void namespaces_setup(const struct container *c, uid_t uid_out, uid_t uid_in, gid_t gid_out, gid_t gid_in) { int fd; + VERBOSE("setting up namespaces: %d:%d -> %d:%d", + uid_out, gid_out, uid_in, gid_in); LOG_IDS; Zf (unshare(CLONE_NEWNS|CLONE_NEWUSER), "can't init user+mount namespaces"); LOG_IDS; @@ -812,7 +536,7 @@ void setup_namespaces(const struct container *c, uid_t uid_out, uid_t uid_in, see issue #212. After bind-mounting, we remove the files from the host; they persist inside the container and then disappear completely when the container exits. */ -void setup_passwd(const struct container *c) +void passwd_setup(const struct container *c) { int fd; char *path; @@ -868,6 +592,67 @@ void setup_passwd(const struct container *c) Z_ (unlink(path)); } +/* Pivot into the container. Note that pivot_root(2) requires a complex dance + to work, i.e., to avoid multiple undocumented error conditions. This dance + is explained in detail in bin/ch-checkns.c. */ +void pivot(struct container *c) +{ + char *nr_parent, *nr_base; + + VERBOSE("pivoting into container"); + path_split(c->newroot, &nr_parent, &nr_base); + + // Overmount / to avoid EINVAL if it’s a rootfs. + Z_ (chdir(nr_parent)); + Z_ (mount(nr_parent, "/", NULL, MS_MOVE, NULL)); + Z_ (chroot(".")); + // Pivot into the new root. Use /dev because it’s available even in + // extremely minimal images. 
+ c->newroot = cat("/", nr_base); + Zf (chdir(c->newroot), "can't chdir into new root"); + Zf (syscall(SYS_pivot_root, c->newroot, path_join(c->newroot, "dev")), + "can't pivot_root(2)"); + Zf (chroot("."), "can't chroot(2) into new root"); + Zf (umount2("/dev", MNT_DETACH), "can't umount old root"); + free(nr_parent); + free(nr_base); +} + +/* Replace the current process with user command and arguments. */ +void run_user_command(char *argv[], const char *initial_dir) +{ + LOG_IDS; + + if (initial_dir != NULL) + Zf (chdir(initial_dir), "can't cd to %s", initial_dir); + + VERBOSE("executing: %s", argv_to_string(argv)); + + Zf (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0), "can't set no_new_privs"); + if (verbose < LL_INFO) + T_ (freopen("/dev/null", "w", stdout)); + if (verbose < LL_STDERR) + T_ (freopen("/dev/null", "w", stderr)); + execvp(argv[0], argv); // only returns if error + Tf (0, "can't execve(2): %s", argv[0]); +} + +/* Wait for semaphore sem for up to timeout seconds. If timeout or an error, + exit unsuccessfully. */ +void sem_timedwait_relative(sem_t *sem, int timeout) +{ + struct timespec deadline; + + // sem_timedwait() requires a deadline rather than a timeout. + Z_ (clock_gettime(CLOCK_REALTIME, &deadline)); + deadline.tv_sec += timeout; + + if (sem_timedwait(sem, &deadline)) { + Ze (errno == ETIMEDOUT, "timeout waiting for join lock"); + Tf (0, "failure waiting for join lock"); + } +} + /* Mount a tmpfs at the given path. 
*/ void tmpfs_mount(const char *dst, const char *newroot, const char *data) { diff --git a/bin/core.h b/bin/core.h index 95ab43b11..ac825babe 100644 --- a/bin/core.h +++ b/bin/core.h @@ -31,6 +31,12 @@ struct hook { void *data; }; +enum hook_dup { // see hook_add() + HOOK_DUP_OK, + HOOK_DUP_SKIP, + HOOK_DUP_FAIL +}; + enum img_type { IMG_DIRECTORY, // normal directory, perhaps an external mount of some kind IMG_SQUASH, // SquashFS archive file (not yet mounted) @@ -63,7 +69,8 @@ struct container { /** Function prototypes **/ void containerize(struct container *c); -void hook_add(struct hook **hook_list, const char *name, hookf_t *f, void *d); +void hook_add(struct hook **hook_list, enum hook_dup dup, + const char *name, hookf_t *f, void *d); void hooks_run(struct container *c, struct hook **hook_list); enum img_type image_type(const char *ref, const char *images_dir); char *img_name2path(const char *name, const char *storage_dir); diff --git a/bin/json.c b/bin/json.c index a9122ed77..37015b1ba 100644 --- a/bin/json.c +++ b/bin/json.c @@ -200,6 +200,46 @@ char *cdi_hook_to_string(const char *hook_name, char **args) return ret; } +/* Update container configuration c according to CDI arguments given. Note + that here we just tidy up the configuration. Actually doing things (e.g. + bind mounts) happens later. */ +void cdi_init(struct container *c, char **devids) +{ + struct cdi_spec **specs = list_new(sizeof(struct cdi_spec *), 12); + + // read CDI spec files in configured directories, if requested + // FIXME + + // read CDI spec files specifically requested + for (size_t i = 0; devids[i] != NULL; i++) + if (devids[i][0] == '.' 
|| devids[i][0] == '/') { + struct cdi_spec *spec = cdi_read(devids[i]); + list_append((void **)&specs, &spec, sizeof(spec)); + } + + // rm duplicate kinds + DEBUG("CDI: read %d specs", list_count(specs, sizeof(specs[0]))); + list_uniq(specs, sizeof(specs[0]), cdi_cmp_kind); + + // debugging: print parsed CDI specs + DEBUG("CDI: using %d specs", list_count(specs, sizeof(specs[0]))); + for (size_t i = 0; specs[i] != NULL; i++) + cdi_log(specs[i]); + + // update c + for (size_t i = 0; specs[i] != NULL; i++) { + // ldconfigs; copy rather than assigning because (1) easier to free + // and (2) still works if we later grow other sources of ldconfig. + list_cat((void **)&c->ldconfigs, (void *)specs[i]->ldconfigs, + sizeof(c->ldconfigs[0])); + } + + // clean up + for (size_t i = 0; specs[i] != NULL; i++) + cdi_free(specs[i]); + free(specs); +} + /* Log contents of spec. */ void cdi_log(struct cdi_spec *spec) { @@ -263,46 +303,6 @@ struct cdi_spec *cdi_read(const char *path) return spec; } -/* Update container configuration c according to CDI arguments given. Note - that here we just tidy up the configuration. Actually doing things (e.g. - bind mounts) happens later. */ -void cdi_update(struct container *c, char **devids) -{ - struct cdi_spec **specs = list_new(sizeof(struct cdi_spec *), 12); - - // read CDI spec files in configured directories, if requested - // FIXME - - // read CDI spec files specifically requested - for (size_t i = 0; devids[i] != NULL; i++) - if (devids[i][0] == '.' 
|| devids[i][0] == '/') { - struct cdi_spec *spec = cdi_read(devids[i]); - list_append((void **)&specs, &spec, sizeof(spec)); - } - - // rm duplicate kinds - DEBUG("CDI: read %d specs", list_count(specs, sizeof(specs[0]))); - list_uniq(specs, sizeof(specs[0]), cdi_cmp_kind); - - // debugging: print parsed CDI specs - DEBUG("CDI: using %d specs", list_count(specs, sizeof(specs[0]))); - for (size_t i = 0; specs[i] != NULL; i++) - cdi_log(specs[0]); - - // update c - for (size_t i = 0; specs[i] != NULL; i++) { - // ldconfigs; copy rather than assigning because (1) easier to free - // and (2) still works if we later grow other sources of ldconfig. - list_cat((void **)&c->ldconfigs, (void *)specs[i]->ldconfigs, - sizeof(c->ldconfigs[0])); - } - - // clean up - for (size_t i = 0; specs[i] != NULL; i++) - cdi_free(specs[i]); - free(specs); -} - void cdiPC_cdiVersion(cJSON *tree, struct cdi_spec *spec) { DEBUG("CDI: %s: version %s", spec->src, tree->valuestring); diff --git a/bin/json.h b/bin/json.h index 139879536..16aaacfb6 100644 --- a/bin/json.h +++ b/bin/json.h @@ -18,4 +18,4 @@ /** Function prototypes **/ -void cdi_update(struct container *c, char ** devids); +void cdi_init(struct container *c, char ** devids); diff --git a/bin/misc.c b/bin/misc.c index 873997d0c..339aacd36 100644 --- a/bin/misc.c +++ b/bin/misc.c @@ -344,7 +344,7 @@ void env_set(const char *name, const char *value, const bool expand) } // Save results. 
- VERBOSE("environment: %s=%s", name, value); + DEBUG("environment: %s=%s", name, value); Z_ (setenv(name, value, 1)); free(vwk); } @@ -388,7 +388,7 @@ void envs_unset(const char *glob) T_ (name != NULL); // environ entries must always have equals matchp = fnmatch(glob, name, FNM_EXTMATCH); // extglobs if available if (matchp == 0) { - VERBOSE("environment: unset %s", name); + DEBUG("environment: unset %s", name); } else { T_ (matchp == FNM_NOMATCH); *(value - 1) = '='; // rejoin line @@ -902,17 +902,22 @@ unsigned long path_mount_flags(const char *path) | (sv.f_flag & ST_SYNCHRONOUS ? MS_SYNCHRONOUS : 0); } -/* Split path into dirname and basename. */ +/* Split path into dirname and basename. If dir and/or base is NULL, then skip + that output. */ void path_split(const char *path, char **dir, char **base) { char *path2; - T_ (path2 = strdup(path)); - T_ (*dir = strdup(dirname(path2))); - free(path2); - T_ (path2 = strdup(path)); - T_ (*base = strdup(basename(path2))); - free(path2); + if (dir != NULL) { + T_ (path2 = strdup(path)); + T_ (*dir = strdup(dirname(path2))); + free(path2); + } + if (base != NULL) { + T_ (path2 = strdup(path)); + T_ (*base = strdup(basename(path2))); + free(path2); + } } /* Return true if path is a subdirectory of base, false otherwise. Acts on the diff --git a/bin/misc.h b/bin/misc.h index 7987a210a..778edf2cd 100644 --- a/bin/misc.h +++ b/bin/misc.h @@ -10,6 +10,7 @@ #include #include #include +#include #include diff --git a/bin/seccomp.c b/bin/seccomp.c new file mode 100644 index 000000000..e027f957a --- /dev/null +++ b/bin/seccomp.c @@ -0,0 +1,258 @@ +/* Copyright © Triad National Security, LLC, and others. + + This interface contains the seccomp filter for root emulation. */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "core.h" +#include "hook.h" + + +/** Macros **/ + +/* On some distros (e.g., CentOS 7), some of the architecture numbers are + missing. 
The workaround is to use the numbers I have on Debian Bullseye. + The reason I (Reid) feel moderately comfortable doing this is how militant + Linux is about not changing the userspace API. */ +#ifndef AUDIT_ARCH_AARCH64 +#define AUDIT_ARCH_AARCH64 0xC00000B7u // undeclared on CentOS 7 +#undef AUDIT_ARCH_ARM // uses undeclared EM_ARM on CentOS 7 +#define AUDIT_ARCH_ARM 0x40000028u +#endif + +/* Special values for seccomp tables. These must be negative to avoid clashing + with real syscall numbers (note zero is often a valid syscall number). */ +#define NR_NON -1 // syscall does not exist on architecture +#define NR_END -2 // end of table + +/** Constants **/ + +/* Architectures we support for seccomp. Order matches the table below. */ +int SECCOMP_ARCHS[] = { AUDIT_ARCH_AARCH64, // arm64 + AUDIT_ARCH_ARM, // arm32 + AUDIT_ARCH_I386, // x86 (32-bit) + AUDIT_ARCH_PPC64LE, // PPC + AUDIT_ARCH_S390X, // s390x + AUDIT_ARCH_X86_64, // x86-64 + NR_END }; + +/* System call numbers that we fake with seccomp (by doing nothing and + returning success). Some processors can execute multiple architectures + (e.g., 64-bit Intel CPUs can run both x86-64 and x86 code), and a process’ + architecture can even change (if you execve(2) a binary of a different + architecture), so we can’t just use the build host’s architecture. + + I haven’t figured out how to gather these system call numbers + automatically, so they are compiled from [1, 2, 3]. See also [4] for a more + general reference. + + NOTE: The total number of faked syscalls (i.e., non-NR_NON entries below) + must be somewhat less than 256. I haven’t computed the exact limit. There + will be an assertion failure at runtime if this is exceeded. + + WARNING: Keep this list consistent with the ch-image(1) man page! 
+ + [1]: https://chromium.googlesource.com/chromiumos/docs/+/HEAD/constants/syscalls.md#Cross_arch-Numbers + [2]: https://github.com/strace/strace/blob/v4.26/linux/powerpc64/syscallent.h + [3]: https://github.com/strace/strace/blob/v6.6/src/linux/s390x/syscallent.h + [4]: https://unix.stackexchange.com/questions/421750 */ +int FAKE_SYSCALL_NRS[][6] = { + // arm64 arm32 x86 PPC64 s390x x86-64 + // ------ ------ ------ ------ ------ ------ + { 91, 185, 185, 184, 185, 126 }, // capset + { NR_NON, 182, 182, 181, 212, 92 }, // chown + { NR_NON, 212, 212, NR_NON, NR_NON, NR_NON }, // chown32 + { 55, 95, 95, 95, 207, 93 }, // fchown + { NR_NON, 207, 207, NR_NON, NR_NON, NR_NON }, // fchown32 + { 54, 325, 298, 289, 291, 260 }, // fchownat + { NR_NON, 16, 16, 16, 198, 94 }, // lchown + { NR_NON, 198, 198, NR_NON, NR_NON, NR_NON }, // lchown32 + { 104, 347, 283, 268, 277, 246 }, // kexec_load + { 152, 139, 139, 139, 216, 123 }, // setfsgid + { NR_NON, 216, 216, NR_NON, NR_NON, NR_NON }, // setfsgid32 + { 151, 138, 138, 138, 215, 122 }, // setfsuid + { NR_NON, 215, 215, NR_NON, NR_NON, NR_NON }, // setfsuid32 + { 144, 46, 46, 46, 214, 106 }, // setgid + { NR_NON, 214, 214, NR_NON, NR_NON, NR_NON }, // setgid32 + { 159, 81, 81, 81, 206, 116 }, // setgroups + { NR_NON, 206, 206, NR_NON, NR_NON, NR_NON }, // setgroups32 + { 143, 71, 71, 71, 204, 114 }, // setregid + { NR_NON, 204, 204, NR_NON, NR_NON, NR_NON }, // setregid32 + { 149, 170, 170, 169, 210, 119 }, // setresgid + { NR_NON, 210, 210, NR_NON, NR_NON, NR_NON }, // setresgid32 + { 147, 164, 164, 164, 208, 117 }, // setresuid + { NR_NON, 208, 208, NR_NON, NR_NON, NR_NON }, // setresuid32 + { 145, 70, 70, 70, 203, 113 }, // setreuid + { NR_NON, 203, 203, NR_NON, NR_NON, NR_NON }, // setreuid32 + { 146, 23, 23, 23, 213, 105 }, // setuid + { NR_NON, 213, 213, NR_NON, NR_NON, NR_NON }, // setuid32 + { NR_END }, // end +}; +int FAKE_MKNOD_NRS[] = + { NR_NON, 14, 14, 14, 14, 133 }; +int FAKE_MKNODAT_NRS[] = + { 33, 324, 297, 
288, 290, 259 }; + + +/** Function prototypes (private) **/ + +void iw(struct sock_fprog *p, int i, + uint16_t op, uint32_t k, uint8_t jt, uint8_t jf); + + +/** Functions **/ + +/* Prestart hook to set up the fake-syscall seccomp(2) filter. This computes + and installs a long-ish but fairly simple BPF program to implement the + filter. To understand this rather hairy language: + + 1. https://man7.org/training/download/secisol_seccomp_slides.pdf + 2. https://www.kernel.org/doc/html/latest/userspace-api/seccomp_filter.html + 3. https://elixir.bootlin.com/linux/latest/source/samples/seccomp */ +void hook_seccomp_install(struct container *c, void *d) +{ + int arch_ct = sizeof(SECCOMP_ARCHS)/sizeof(SECCOMP_ARCHS[0]) - 1; + int syscall_cts[arch_ct]; + struct sock_fprog p = { 0 }; + int ii, idx_allow, idx_fake, idx_mknod, idx_mknodat, idx_next_arch; + // Lengths of certain instruction groups. These are all obtained manually + // by counting below, violating DRY. We could automate these counts, but it + // seemed like the cost of extra buffers and code to do that would exceed + // that of maintaining the manual counts. + int ct_jump_start = 4; // ld arch & syscall nr, arch test, end-of-arch jump + int ct_mknod_jump = 2; // jump table handling for mknod(2) and mknodat(2) + int ct_mknod = 2; // mknod(2) handling + int ct_mknodat = 6; // mknodat(2) handling + + // Count how many syscalls we are going to fake in the standard way. We + // need this to compute the right offsets for all the jumps. + for (int ai = 0; SECCOMP_ARCHS[ai] != NR_END; ai++) { + p.len += ct_jump_start + ct_mknod_jump; + syscall_cts[ai] = 0; + for (int si = 0; FAKE_SYSCALL_NRS[si][0] != NR_END; si++) { + bool syscall_p = FAKE_SYSCALL_NRS[si][ai] != NR_NON; + syscall_cts[ai] += syscall_p; + p.len += syscall_p; // syscall jump table entry + } + } + + // Initialize program buffer. 
+ p.len += ( 1 // return allow + + 1 // return fake success + + ct_mknod // mknod(2) handling + + ct_mknodat); // mknodat(2) handling + DEBUG("seccomp: filter program has %d instructions", p.len); + T_ (p.filter = calloc(p.len, sizeof(struct sock_filter))); + + // Return call addresses. Allow needs to come first because we’ll jump to + // it for unknown architectures. + idx_allow = p.len - 2 - ct_mknod - ct_mknodat; + idx_fake = p.len - 1 - ct_mknod - ct_mknodat; + idx_mknod = p.len - ct_mknod - ct_mknodat; + idx_mknodat = p.len - ct_mknodat; + + // Build a jump table for each architecture. The gist is: if architecture + // matches, fall through into the jump table, otherwise jump to the next + // architecture (or ALLOW for the last architecture). + ii = 0; + idx_next_arch = -1; // avoid warning on some compilers + for (int ai = 0; SECCOMP_ARCHS[ai] != NR_END; ai++) { + int jump; + idx_next_arch = ii + syscall_cts[ai] + ct_jump_start + ct_mknod_jump; + // load arch into accumulator + iw(&p, ii++, BPF_LD|BPF_W|BPF_ABS, + offsetof(struct seccomp_data, arch), 0, 0); + // jump to next arch if arch doesn't match + jump = idx_next_arch - ii - 1; + T_ (jump <= 255); + iw(&p, ii++, BPF_JMP|BPF_JEQ|BPF_K, SECCOMP_ARCHS[ai], 0, jump); + // load syscall number into accumulator + iw(&p, ii++, BPF_LD|BPF_W|BPF_ABS, + offsetof(struct seccomp_data, nr), 0, 0); + // jump table of syscalls + for (int si = 0; FAKE_SYSCALL_NRS[si][0] != NR_END; si++) { + int nr = FAKE_SYSCALL_NRS[si][ai]; + if (nr != NR_NON) { + jump = idx_fake - ii - 1; + T_ (jump <= 255); + iw(&p, ii++, BPF_JMP|BPF_JEQ|BPF_K, nr, jump, 0); + } + } + // jump to mknod(2) handling (add even if syscall not implemented to + // make the instruction counts simpler) + jump = idx_mknod - ii - 1; + T_ (jump <= 255); + iw(&p, ii++, BPF_JMP|BPF_JEQ|BPF_K, FAKE_MKNOD_NRS[ai], jump, 0); + // jump to mknodat(2) handling + jump = idx_mknodat - ii - 1; + T_ (jump <= 255); + iw(&p, ii++, BPF_JMP|BPF_JEQ|BPF_K, FAKE_MKNODAT_NRS[ai], 
jump, 0); + // unfiltered syscall, jump to allow (limit of 255 doesn’t apply to JA) + jump = idx_allow - ii - 1; + iw(&p, ii++, BPF_JMP|BPF_JA, jump, 0, 0); + } + T_ (idx_next_arch == idx_allow); + + // Returns. (Note that if we wanted a non-zero errno, we’d bitwise-or with + // SECCOMP_RET_ERRNO. But because fake success is errno == 0, we don’t need + // a no-op “| 0”.) + T_ (ii == idx_allow); + iw(&p, ii++, BPF_RET|BPF_K, SECCOMP_RET_ALLOW, 0, 0); + T_ (ii == idx_fake); + iw(&p, ii++, BPF_RET|BPF_K, SECCOMP_RET_ERRNO, 0, 0); + + // mknod(2) handling. This just loads the file mode and jumps to the right + // place in the mknodat(2) handling. + T_ (ii == idx_mknod); + // load mode argument into accumulator + iw(&p, ii++, BPF_LD|BPF_W|BPF_ABS, + offsetof(struct seccomp_data, args[1]), 0, 0); + // jump to mode test + iw(&p, ii++, BPF_JMP|BPF_JA, 1, 0, 0); + + // mknodat(2) handling. + T_ (ii == idx_mknodat); + // load mode argument into accumulator + iw(&p, ii++, BPF_LD|BPF_W|BPF_ABS, + offsetof(struct seccomp_data, args[2]), 0, 0); + // jump to fake return if trying to create a device. + iw(&p, ii++, BPF_ALU|BPF_AND|BPF_K, S_IFMT, 0, 0); // file type only + iw(&p, ii++, BPF_JMP|BPF_JEQ|BPF_K, S_IFCHR, 2, 0); + iw(&p, ii++, BPF_JMP|BPF_JEQ|BPF_K, S_IFBLK, 1, 0); + // returns + iw(&p, ii++, BPF_RET|BPF_K, SECCOMP_RET_ALLOW, 0, 0); + iw(&p, ii++, BPF_RET|BPF_K, SECCOMP_RET_ERRNO, 0, 0); + + // Install filter. Use prctl(2) rather than seccomp(2) for slightly greater + // compatibility (Linux 3.5 rather than 3.17) and because there is a glibc + // wrapper. + T_ (ii == p.len); // next instruction now one past the end of the buffer + Z_ (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &p)); + DEBUG("seccomp: see contributor's guide to disassemble") + + // Test filter. This will fail if the kernel executes the call (because we + // are not really privileged and the arguments are bogus) or succeed if + // filter handles it. 
We selected it over something more naturally in the + // filter, e.g. setuid(2), because (1) no container process should ever use + // it and (2) it’s unlikely to be emulated by a smarter filter in the + // future, i.e., it won’t silently start doing something. + Zf (syscall(SYS_kexec_load, 0, 0, NULL, 0), + "seccomp root emulation failed (is your architecture supported?)"); +} + +/* Helper function to write seccomp-bpf programs. */ +void iw(struct sock_fprog *p, int i, + uint16_t op, uint32_t k, uint8_t jt, uint8_t jf) +{ + p->filter[i] = (struct sock_filter){ op, jt, jf, k }; +} + diff --git a/bin/seccomp.h b/bin/seccomp.h index ed46e93a5..821a646ee 100644 --- a/bin/seccomp.h +++ b/bin/seccomp.h @@ -7,4 +7,4 @@ #include "core.h" -void rootemu_init(struct container *c); +void hook_seccomp_install(struct container *c, void *d); From a1fe68dfc6b1ebc333310b287c69089d5c740779 Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky Date: Thu, 29 Aug 2024 15:27:34 -0600 Subject: [PATCH 21/29] snapshot because I got fed up with manual memory management --- bin/ch-run.c | 67 +++++++++++++----- bin/json.c | 181 ++++++++++++++++++++++++++++++++++++++----------- bin/json.h | 12 +++- bin/misc.c | 94 +++++++++++++++++++++++-- bin/misc.h | 6 +- doc/ch-run.rst | 20 +++--- 6 files changed, 305 insertions(+), 75 deletions(-) diff --git a/bin/ch-run.c b/bin/ch-run.c index 569f3ee6c..79996f97a 100644 --- a/bin/ch-run.c +++ b/bin/ch-run.c @@ -27,11 +27,12 @@ /** Types **/ enum env_option_type { - ENV_END = 0, // list terminator sentinel - ENV_SET, // --set-env - ENV_SET0, // --set-env0 - ENV_UNSET, // --unset-env - ENV_CDI_DEV, // --device + ENV_END = 0, // list terminator sentinel + ENV_SET, // --set-env + ENV_SET0, // --set-env0 + ENV_UNSET, // --unset-env + ENV_CDI_DEV, // --device (specific device) + ENV_CDI_ALL, // --devices (all known devices) }; struct env_option { @@ -42,7 +43,7 @@ struct env_option { struct args { struct container c; #ifdef HAVE_JSON - char **cdi_devids; + struct 
cdi_config cdi; #endif struct env_option *env_options; enum log_color_when log_color; @@ -107,6 +108,8 @@ const char args_doc[] = "IMAGE -- COMMAND [ARG...]"; /* Note: Long option numbers, once issued, are permanent; i.e., if you remove one, don’t re-number the others. */ const struct argp_option options[] = { + { "abort-fatal", -21, 0, 0, + "exit abnormally on error, maybe dumping core" }, { "bind", 'b', "SRC[:DST]", 0, "mount SRC at guest DST (default: same as SRC)"}, { "cd", 'c', "DIR", 0, "initial working directory in container"}, @@ -225,7 +228,12 @@ int main(int argc, char *argv[]) .writable = false }, #ifdef HAVE_JSON - .cdi_devids = list_new(sizeof(char *), 0), + .cdi = (struct cdi_config){ + .spec_dirs = list_new_strings(':', env_get("CH_RUN_CDI_DIRS", + "/etc/cdi:/var/run/cdi")), + .devs_all_p = false, + .devids = list_new(sizeof(char *), 0), + }, #endif .env_options = list_new(sizeof(struct env_option), 0), .initial_dir = NULL, @@ -289,10 +297,7 @@ int main(int argc, char *argv[]) for (int i = 0; i < argc - arg_next; i++) c_argv[i] = argv[i + arg_next]; - if (getenv("TMPDIR") != NULL) - host_tmp = getenv("TMPDIR"); - else - host_tmp = "/tmp"; + host_tmp = env_get("TMPDIR", "/tmp"); // global in misc.c VERBOSE("verbosity: %d", verbose); VERBOSE("image: %s", args.c.img_ref); @@ -302,6 +307,7 @@ int main(int argc, char *argv[]) VERBOSE("container gid: %u", args.c.container_gid); VERBOSE("join: %d %d %s %d", args.c.join, args.c.join_ct, args.c.join_tag, args.c.join_pid); + VERBOSE("host $TMPDIR: %s", host_tmp); VERBOSE("private /tmp: %d", args.c.private_tmp); #ifdef HAVE_SECCOMP VERBOSE("seccomp: %s", bool_to_string(args.seccomp_p)); @@ -309,9 +315,10 @@ int main(int argc, char *argv[]) VERBOSE("unsafe: %s", bool_to_string(args.unsafe)); #ifdef HAVE_JSON - cdi_init(&args.c, args.cdi_devids); + cdi_init(&args.cdi); #endif hooks_env_install(&args); + //cdi_hook_ldconfig_install(&args.c.hook_prestart, &args.cdi); containerize(&args.c); run_user_command(c_argv, 
args.initial_dir); // should never return @@ -420,8 +427,13 @@ void hooks_env_install(struct args *args) case ENV_CDI_DEV: name = "env-set-cdi"; f = hook_envs_set; - //d = cdi_envs_get(arg); + d = cdi_envs_get(arg); break; + case ENV_CDI_ALL: + name = "env-set-cdi-all"; + f = hook_envs_set; + d = cdi_envs_get(NULL); + break; case ENV_END: T_ (false); // unreachable break; @@ -610,10 +622,19 @@ static error_t parse_opt(int key, char *arg, struct argp_state *state) FATAL("invalid --test argument: %s; see source code", arg); break; #ifdef HAVE_JSON - case -18: // --device - Te (strlen(arg) > 0, "--device: DEV must be longer than zero"); - write_fake_enable(args, NULL); - list_append((void **)&(args->cdi_devids), &arg, sizeof(arg)); + case -18: { // --device + struct env_option ope; + Te (strlen(arg) > 0, "--device: DEV must be non-empty"); + write_fake_enable(args, NULL); + list_append((void **)&args->cdi.devids, &arg, sizeof(arg)); + ope.opt = ENV_CDI_DEV; + ope.arg = arg; + list_append((void **)&args->env_options, &ope, sizeof(ope)); + } break; + case -19: // --cdi-dirs + Te (strlen(arg) > 0, "--cdi-dirs: PATHS must be non-empty"); + list_free_shallow((void ***)&args->cdi.spec_dirs); + args->cdi.spec_dirs = list_new_strings(':', arg); break; #endif case -20: // --color @@ -630,6 +651,9 @@ static error_t parse_opt(int key, char *arg, struct argp_state *state) } Tf (args->log_color != LL_COLOR_NULL, "--color: invalid arg: %s", arg); break; + case -21: // --abort-fatal + abort_fatal = true; // in misc.c + break; case 'b': { // --bind char *src, *dst; for (i = 0; args->c.binds[i].src != NULL; i++) // count existing binds @@ -654,6 +678,16 @@ static error_t parse_opt(int key, char *arg, struct argp_state *state) case 'c': // --cd args->initial_dir = arg; break; +#ifdef HAVE_JSON + case 'd': { // --devices + // Can’t add the devices here b/c we don’t know the CDI spec dirs yet. 
+ struct env_option ope; + args->cdi.devs_all_p = true; + ope.opt = ENV_CDI_ALL; + ope.arg = NULL; + list_append((void **)&args->env_options, &ope, sizeof(ope)); + } break; +#endif case 'g': // --gid i = parse_int(arg, false, "--gid"); Te (i >= 0, "--gid: must be non-negative"); diff --git a/bin/json.c b/bin/json.c index 37015b1ba..6e04dbafd 100644 --- a/bin/json.c +++ b/bin/json.c @@ -1,9 +1,12 @@ /* Copyright © Triad National Security, LLC, and others. */ #define _GNU_SOURCE +#include #include #include #include +#include +#include #include "config.h" @@ -35,10 +38,12 @@ struct cdi_hook_dispatch { struct cdi_spec { char *kind; - char *src; // path to source spec file + char *src_path; // source spec file path + dev_t src_dev; // ... device ID + ino_t src_ino; // ... inode number struct env_var *envs; struct bind *binds; - char **ldconfigs; // directories to process with ldconfig(8) + char **ldconfigs; // directories to process with ldconfig(8) }; struct json_dispatch { @@ -54,16 +59,26 @@ struct json_dispatch { // Block size in bytes for reading JSON files. const size_t READ_SZ = 16384; +/** Globals **/ + +// List of CDI specs we’ve read. Yes it’s a global, but that lets us keep +// struct cdi_spec private to this file, which seemed like the right +// trade-off. It also seemed like “all the specs we know about” wasn’t +// something we needed multiple of. 
+struct cdi_spec *cdi_specs = NULL; + /** Function prototypes (private) **/ char **array_strings_json_to_c(cJSON *jarry, size_t *ct); -int cdi_cmp_kind(const void *a, const void *b); +void cdi_append(struct cdi_spec **specs, struct cdi_spec *spec); void cdi_free(struct cdi_spec *spec); void cdi_hook_nv_ldcache(struct cdi_spec *spec, char **args); char *cdi_hook_to_string(const char *hook_name, char **args); void cdi_log(struct cdi_spec *spec); struct cdi_spec *cdi_read(const char *path); +struct cdi_spec *cdi_read_maybe(struct cdi_spec *specs, const char *path); +bool cdi_requested(struct cdi_config *cf, struct cdi_spec *spec); void visit(struct json_dispatch actions[], cJSON *tree, void *state); void visit_dispatch(struct json_dispatch action, cJSON *tree, void *state); @@ -139,25 +154,34 @@ char **array_strings_json_to_c(cJSON *jarry, size_t *ct) return carry; } -/* Compare the kinds of specifications a and b (which are double pointers, - hence the hairy casts). As expected by qsort(3): +/* Return true if devid is a device kind (e.g. “nvidia.com/gpu”), false if + it’s a path. Exit with error if NULL pointer or empty string. */ +bool cdi_devid_kind_p(const char *devid) +{ + T_ (devid != NULL && devid[0] != '\0'); + return (devid[0] != '.' && devid[0] != '/'); +} - if a < b: return negative value - if a = b: return 0 - if a > b: return positive value */ -int cdi_cmp_kind(const void *a, const void *b) +/* Return a list of environment variables to be set for device kind kind, or + if kind is NULL, all known devices. Both the list and the buffers within + are newly allocated; the caller must free the list with envs_free(). */ +struct env_var *cdi_envs_get(const char *devid) { - struct cdi_spec *a_ = *(struct cdi_spec **)a; - struct cdi_spec *b_ = *(struct cdi_spec **)b; + struct env_var *vars; - return strcmp(a_->kind, b_->kind); + // count variables so we can do just one allocation + for () + + // set up the list + + return vars; } /* Free spec. 
*/ void cdi_free(struct cdi_spec *spec) { free(spec->kind); - free(spec->src); + free(spec->src_path); for (size_t i = 0; spec->envs[i].name != NULL; i++) { free(spec->envs[i].name); free(spec->envs[i].value); @@ -200,32 +224,58 @@ char *cdi_hook_to_string(const char *hook_name, char **args) return ret; } -/* Update container configuration c according to CDI arguments given. Note - that here we just tidy up the configuration. Actually doing things (e.g. - bind mounts) happens later. */ -void cdi_init(struct container *c, char **devids) -{ - struct cdi_spec **specs = list_new(sizeof(struct cdi_spec *), 12); +/* Read the CDI spec files we need. - // read CDI spec files in configured directories, if requested - // FIXME - - // read CDI spec files specifically requested - for (size_t i = 0; devids[i] != NULL; i++) - if (devids[i][0] == '.' || devids[i][0] == '/') { - struct cdi_spec *spec = cdi_read(devids[i]); - list_append((void **)&specs, &spec, sizeof(spec)); + Note: We only read spec files in the search path directories if either + (a) --devices is specified, requesting all known devices or (b) a device + kind (rather than a filename) is given to --device (e.g., “nvidia.com/gpu”. + This protects users from errors in the spec files if they have not + requested any CDI features. */ +void cdi_init(struct cdi_config *cf) +{ + bool req_by_kind = false; + + // Initialize specs list. + T_ (cdi_specs == NULL); + cdi_specs = list_new(sizeof(struct cdi_spec), 0); + + // Read CDI spec files specifically requested. 
+ for (int i = 0; cf->devids[i] != NULL; i++) + if (cdi_devid_kind_p(cf->devids[i])) + req_by_kind = true; + else { + struct cdi_spec *spec = cdi_read_maybe(cdi_specs, cf->devids[i]); + if (spec != NULL) + list_append((void **)&cdi_specs, spec, sizeof(*spec)); + free(spec); } - // rm duplicate kinds - DEBUG("CDI: read %d specs", list_count(specs, sizeof(specs[0]))); - list_uniq(specs, sizeof(specs[0]), cdi_cmp_kind); + // Read CDI spec files in configured directories if necessary. + if (cf->devs_all_p || req_by_kind) + for (int i = 0; cf->spec_dirs[i] != NULL; i++) { + int entry_ct; + struct dirent **des; + entry_ct = dir_ls(cf->spec_dirs[i], &des); + for (int j = 0; j < entry_ct; j++) { + if (!fnmatch("*.json", des[j]->d_name, 0)) { + char *path = path_join(cf->spec_dirs[i], des[j]->d_name); + struct cdi_spec *spec = cdi_read_maybe(cdi_specs, path); + if (spec != NULL && cdi_requested(cf, spec)) + list_append((void **)&cdi_specs, spec, sizeof(*spec)); + free(path); + free(spec); + } + free(des[j]); + } + free(des); + } // debugging: print parsed CDI specs - DEBUG("CDI: using %d specs", list_count(specs, sizeof(specs[0]))); - for (size_t i = 0; specs[i] != NULL; i++) - cdi_log(specs[0]); + DEBUG("CDI: read %d specs", list_count(cdi_specs, sizeof(cdi_specs[0]))); + for (size_t i = 0; cdi_specs[i].kind != NULL; i++) + cdi_log(&cdi_specs[i]); +/* // update c for (size_t i = 0; specs[i] != NULL; i++) { // ldconfigs; copy rather than assigning because (1) easier to free @@ -233,11 +283,7 @@ void cdi_init(struct container *c, char **devids) list_cat((void **)&c->ldconfigs, (void *)specs[i]->ldconfigs, sizeof(c->ldconfigs[0])); } - - // clean up - for (size_t i = 0; specs[i] != NULL; i++) - cdi_free(specs[i]); - free(specs); +*/ } /* Log contents of spec. 
*/ @@ -245,7 +291,8 @@ void cdi_log(struct cdi_spec *spec) { size_t ct; - DEBUG("CDI: %s from %s:", spec->kind, spec->src); + DEBUG("CDI: %s from %s (%u,%u %u):", spec->kind, spec->src_path, + major(spec->src_dev), minor(spec->src_dev), spec->src_ino); ct = list_count((void *)(spec->envs), sizeof(struct env_var)); DEBUG("CDI: environment: %d:", ct); for (size_t i = 0; i < ct; i++) @@ -266,6 +313,7 @@ void cdi_log(struct cdi_spec *spec) struct cdi_spec *cdi_read(const char *path) { FILE *fp; + struct stat st; char *text = NULL; const char *parse_end; cJSON *tree; @@ -274,6 +322,7 @@ struct cdi_spec *cdi_read(const char *path) // Read file into string. Allocate incrementally rather than seeking so // non-seekable input works. Tf (fp = fopen(path, "rb"), "CDI: can't open: %s", path); + Zf (fstat(fileno(fp), &st), "CDI: can't stat: %s", path); for (size_t used = 0, avail = READ_SZ; true; avail += READ_SZ) { T_ (text = realloc(text, avail)); size_t read_ct = fread(text + used, 1, READ_SZ, fp); @@ -293,7 +342,9 @@ struct cdi_spec *cdi_read(const char *path) // Visit parse tree to build our struct. T_ (spec = calloc(1, sizeof(struct cdi_spec))); - T_ (spec->src = strdup(path)); + T_ (spec->src_path = strdup(path)); + spec->src_dev = st.st_dev; + spec->src_ino = st.st_ino; visit(cdiPD_root, tree, spec); // Clean up. @@ -303,9 +354,57 @@ struct cdi_spec *cdi_read(const char *path) return spec; } +/* Read and parse the CDI spec file at path, returning a pointer to the + newly-allocated spec struct, unless (1) we already read the file, in which + case log that fact and return NULL, or (2) the device kind has already been + specified, in which case exit with error. If something else goes wrong, + also exit with error. */ +struct cdi_spec *cdi_read_maybe(struct cdi_spec *specs, const char *path) +{ + struct cdi_spec *spec; + struct stat st; + + // Don’t read file if we already did. It’s relatively easy to give a spec + // file more than once, e.g. 
if it’s in the search path and also an + // argument to --device. + for (int i = 0; specs[i].kind != NULL; i++) { + Zf (stat(path, &st), "can’t stat CDI spec: %s", path); + if (st.st_dev == specs[i].src_dev && st.st_ino == specs[i].src_ino) { + VERBOSE("CDI: spec already read, skipping: %s", path); + return NULL; + } + } + + spec = cdi_read(path); + + // Error if this device already specified, which because we don’t re-read + // files means two files specified the same device kind. + for (int i = 0; specs[i].kind != NULL; i++) + Te (strcmp(spec->kind, specs[i].kind), + "CDI: device found in multiple spec files: %s: %s and %s", + spec->kind, specs[i].src_path, spec->src_path); + + return spec; +} + +/* Return true if the given spec was requested by configuration cf, false + otherwise. */ +bool cdi_requested(struct cdi_config *cf, struct cdi_spec *spec) +{ + if (cf->devs_all_p) + return true; + + for (int i = 0; cf->devids[i] != NULL; i++) + if ( cdi_devid_kind_p(cf->devids[i]) + && !strcmp(cf->devids[i], spec->kind)) + return true; + + return false; +} + void cdiPC_cdiVersion(cJSON *tree, struct cdi_spec *spec) { - DEBUG("CDI: %s: version %s", spec->src, tree->valuestring); + DEBUG("CDI: %s: version %s", spec->src_path, tree->valuestring); } void cdiPC_env(cJSON *tree, struct cdi_spec *spec) diff --git a/bin/json.h b/bin/json.h index 16aaacfb6..4b86a7660 100644 --- a/bin/json.h +++ b/bin/json.h @@ -6,6 +6,8 @@ #define _GNU_SOURCE #pragma once +#include + #include "config.h" #include "core.h" #include "misc.h" @@ -15,7 +17,15 @@ /** Types **/ +/* General CDI configuration. 
*/ +struct cdi_config { + char **spec_dirs; // directories to search for CDI spec files + bool devs_all_p; // inject all devices found + char **devids; // user-requested devices +}; + /** Function prototypes **/ -void cdi_init(struct container *c, char ** devids); +void cdi_envs_get(const char *devid); +void cdi_init(struct cdi_config *cf); diff --git a/bin/misc.c b/bin/misc.c index 339aacd36..cf5317bd4 100644 --- a/bin/misc.c +++ b/bin/misc.c @@ -64,8 +64,8 @@ static const char **LL_COLOURS = _LL_COLOURS + 3; /** External variables **/ -/* Level of chatter on stderr. */ -enum log_level verbose; +/* If true, exit abnormally on fatal error. Set in ch-run.c. */ +bool abort_fatal = false; /* If true, use colored logging. Set in ch-run.c. */ bool log_color_p = false; @@ -76,6 +76,9 @@ char *host_tmp = NULL; /* Username of invoking users. Set during command line processing. */ char *username = NULL; +/* Level of chatter on stderr. */ +enum log_level verbose; + /* List of warnings to be re-printed on exit. This is a buffer of shared memory allocated by mmap(2), structured as a sequence of null-terminated character strings. Warnings that do not fit in this buffer will be lost, though we @@ -89,6 +92,7 @@ char *warnings; size_t warnings_offset = 0; + /** Function prototypes (private) **/ void mkdir_overmount(const char *path, const char *scratch); @@ -241,7 +245,7 @@ int dir_ls(const char *path, struct dirent ***namelist) int entry_ct; entry_ct = scandir(path, namelist, dir_ls_filter, NULL); - Tf (entry_ct >= 0, "can't scan dir", path); + Tf (entry_ct >= 0, "can't scan dir: %s", path); return entry_ct; } @@ -310,6 +314,20 @@ struct env_var *env_file_read(const char *path, int delim) return vars; } +/* Return the value of environment variable name if set; otherwise, return + value_default instead. 
+ + Note the implications for memory management: you may get a pointer into + environ (?), which you do not own and must not free, or value_default, + which may or may not need to be freed. */ +char *env_get(const char *name, char *value_default) +{ + char *ret = getenv(name); + return ret ? ret : value_default; +} + + + /* Set environment variable name to value. If expand, then further expand variables in value marked with "$" as described in the man page. */ void env_set(const char *name, const char *value, const bool expand) @@ -485,6 +503,18 @@ size_t list_count(void *ar, size_t size) return ct; } +/* *ar is a list of pointers to malloc()’ed buffers (which is why object size + is not provided). Free those buffers, then free *ar itself and set it to + NULL. */ +void list_free_shallow(void ***ar) +{ + T_ (*ar != NULL); + for (int i = 0; (*ar)[i] != NULL; i++) + free((*ar)[i]); + free(*ar); + *ar = NULL; +} + /* Return a pointer to a new, empty zero-terminated array containing elements of size size, with room for ct elements without re-allocation. The latter allows to pre-allocate an arbitrary number of slots in the list, which can @@ -494,10 +524,63 @@ size_t list_count(void *ar, size_t size) void *list_new(size_t size, size_t ct) { void *list; + T_ (size > 0); T_ (list = calloc(ct+1, size)); return list; } +/* Split str into tokens delimited by delim (multiple adjacent delimiters are + treated as one). Copy each token into a newly-allocated string buffer, and + return these strings as a new list. + + Notes: + + 1. The interface deliberately accepts a single delimiter, not multiple + like strtok(3). + + 2. This approach has a redundant malloc(3) for each token, because we + have to copy the input string into a new buffer anyway to satisfy + strtok_r(3). We could use the multiple token pointers into this single + buffer as the list elements. 
However, this would yield a + difficult-to-free list: one would have to free only the *first* + element in the list and no others. Also, if any other strings are + later added to the list, those would need to be freed differently. + This all seemed extremely bug-prone. */ +void *list_new_strings(char delim, const char *str) +{ + char **list; + char *str_copy, *str_init, *tok_state; + char delims[] = { delim, '\0' }; + size_t delim_ct = 0; + + // Count delimiters so we can allocate the right size list initially, + // avoiding one realloc() per delimiter. Note this does not account for + // adjacent delimiters and thus may overcount tokens, possibly wasting a + // small amount of memory. + for (int i = 0; str[i] != '\0'; i++) + delim_ct += str[i] == delim ? 1 : 0; + + list = list_new(sizeof(char *), delim_ct + 1); + + // Note: strtok_r(3)’s interface is rather awkward; see its man page. + T_ (str_copy = strdup(str)); + str_init = str_copy; + tok_state = NULL; + for (int i = 0; true; i++) { + char *tok; + tok = strtok_r(str_init, delims, &tok_state); + if (tok == NULL) + break; + T_ (i < delim_ct + 1); // bounds check + T_ (tok = strdup(tok)); // copy tok into buffer we own + list[i] = tok; + str_init = NULL; + } + free(str_copy); + + return list; +} + /* Remove any duplicate elements in ar, in-place, according to comparison function cmp. The last duplicate in the list wins. Preserves order otherwise. */ @@ -743,7 +826,10 @@ noreturn void msg_fatal(const char *file, int line, int errno_, msgv(LL_FATAL, file, line, errno_, fmt, ap); va_end(ap); - exit(EXIT_FAILURE); + if (abort_fatal) + abort(); + else + exit(EXIT_FAILURE); } /* va_list form of msg(). 
*/ diff --git a/bin/misc.h b/bin/misc.h index 778edf2cd..aec855943 100644 --- a/bin/misc.h +++ b/bin/misc.h @@ -108,10 +108,11 @@ enum log_test { LL_TEST_NONE = 0, /** External variables **/ -extern enum log_level verbose; +extern bool abort_fatal; extern bool log_color_p; extern char *host_tmp; extern char *username; +extern enum log_level verbose; extern char *warnings; extern size_t warnings_offset; @@ -127,6 +128,7 @@ int dir_ls(const char *path, struct dirent ***namelist); int dir_ls_count(const char *path); int dir_ls_filter(const struct dirent *e); struct env_var *env_file_read(const char *path, int delim); +char *env_get(const char *name, char *value_default); void env_set(const char *name, const char *value, const bool expand); void envs_free(struct env_var **vars); void envs_set(const struct env_var *envs, const bool expand); @@ -135,6 +137,8 @@ struct env_var env_var_parse(const char *line, const char *path, size_t lineno); void list_append(void **ar, void *new, size_t size); void list_cat(void **dst, void *src, size_t size); size_t list_count(void *ar, size_t size); +void list_free_shallow(void ***ar); +void *list_new_strings(char delim, const char *s); void *list_new(size_t size, size_t ct); void list_uniq(void *ar, size_t size, comparison_fn_t cmp); void log_ids(const char *func, int line); diff --git a/doc/ch-run.rst b/doc/ch-run.rst index ec9d63876..8622a3b4e 100644 --- a/doc/ch-run.rst +++ b/doc/ch-run.rst @@ -56,7 +56,7 @@ mounting SquashFS images with FUSE. :code:`-c`, :code:`--cd=DIR` Initial working directory in container. - :code:`--cdi-dirs=DIRS` + :code:`--cdi-dirs=PATHS` Colon-separated list of directories to search for CDI JSON specifications. Default: :code:`CH_RUN_CDI_DIRS` if set, otherwise :code:`/etc/cdi:/var/run/cdi`. @@ -79,16 +79,14 @@ mounting SquashFS images with FUSE. should work on all modern terminals. :code:`-d`, :code:`--devices` - Inject default CDI devices into the container. 
The default devices are - those listed in :code:`CH_RUN_CDI_DEFAULT` if set, otherwise all devices - for which a specification is found. Implies :code:`--write-fake`. - - :code:`--device=DEV[,DEV]` - Inject CDI device(s) identified by comma-separated :code:`DEV`. These are - either (1) a filename, if :code:`DEV` starts with a slash (:code:`/`) or - dot (:code:`.`), e.g. :code:`/etc/cdi/nvidia.json`, or (2) a CDI selector - for a list of devices in a CDI specification file, e.g. - :code:`nvidia.com/gpu`. Specific devices may not be selected, e.g. + Inject all CDI devices for which a specification is found. Implies + :code:`--write-fake`. + + :code:`--device=DEV` + Inject CDI device :code:`DEV`, either (1) a filename, if it starts with a + slash (:code:`/`) or dot (:code:`.`), e.g. :code:`/etc/cdi/nvidia.json`, + or (2) a CDI selector for a list of devices in a CDI specification file, + e.g. :code:`nvidia.com/gpu`. Specific devices may not be selected, e.g. :code:`nvidia.com/gpu=1:0` is invalid (see below for why). Implies :code:`--write-fake`. Can be repeated. From 2ef2430a54743f796f9fb44f28a4f1a16a837ca4 Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky Date: Thu, 5 Sep 2024 17:19:15 -0600 Subject: [PATCH 22/29] document new memory management strategy [skip ci] --- doc/dev.rst | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/doc/dev.rst b/doc/dev.rst index 8458274b0..380952983 100644 --- a/doc/dev.rst +++ b/doc/dev.rst @@ -941,6 +941,56 @@ characters. C code ------ +Memory management +~~~~~~~~~~~~~~~~~ + +*TL;DR:* Charliecloud does not call :code:`free(3)`. + +*How-To:* (1) Use Charliecloud wrappers for all library functions that +allocate memory, e.g. :code:`ch_malloc()` instead of :code:`malloc(3)`. +Importantly, this includes things like :code:`strdup(3)` and +:code:`asprintf(3)`. (2) Don’t call :code:`free(3)` or any other library +functions that free memory. 
+ +:code:`ch-run.c` has, since `very nearly the beginning +`_, carried the notice +that it “does not bother to free memory allocations, since they are modest and +the program is short-lived”. Explicit memory management is difficult and +time-consuming, and it didn’t seem worth the effort. + +Eventually, we grew a `long-running process +`_ to serve a +SquashFUSE filesystem, and the short-lived justification became obsolete. The +rough goal became: convert to proper memory management, freeing everything +that we allocated. Various :code:`free(3)` crept in here and there, but a full +refactor was never a priority. + +Then `PR #1919 `_ came to be +and grew in scope until it was a significant refactor. We tried to Do It Right +on memory management everywhere this PR touched, and we did, until Reid got +fed up writing comments about whose problem it was to free this or that and +copying data simply so those comments could be tractable. + +So now we’re back full circle. Memory management is not worth Charliecloud +developers’ time. We gleefully :code:`malloc(3)` and :code:`realloc(3)` +without a care in the world, sinning every time. But now you have options. You +can either: + +1. YOLO, i.e. simply never free anything, i.e. leak like a sieve. But + Charliecloud is still a small program and it’s unlikely to be an actual + problem. Quick-and-dirty tests show a main :code:`ch-run` process using + **FIXME** MiB just before it executes the user program, and the SquashFUSE + process **FIXME** MiB upon exit. + +2. Link with :code:`libgc`, i.e. the `Boehm-Demers-Weiser + `_ conservative garbage collector. The idea is + that garbage collection scans the stack, heap, and other pointer sources + for integers that *look* like pointers and assumes they *are* pointers. + Apparently it `works quite well `_ and + can even be faster than explicit memory management in some cases. 
The + quick-and-dirty tests show **FIXME** MiB by the main process, and the + SquashFUSE process **FIXME** just after forking and **FIXME** upon exit. + :code:`const` ~~~~~~~~~~~~~ From 878703e2f44ed3007f48c44cc903495c99ab8236 Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky Date: Thu, 12 Sep 2024 15:26:43 -0600 Subject: [PATCH 23/29] snapshot of non-zeroing mode --- bin/mem.c | 251 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ bin/mem.h | 18 ++++ 2 files changed, 269 insertions(+) create mode 100644 bin/mem.c create mode 100644 bin/mem.h diff --git a/bin/mem.c b/bin/mem.c new file mode 100644 index 000000000..efdead799 --- /dev/null +++ b/bin/mem.c @@ -0,0 +1,251 @@ +/* Re. zeroing newly-allocated memory: + + Because we use a lot of zero-terminated data structures, it would be nice + for the allocation functions to return zeroed buffers. We also want to not + require libgc, i.e., we want to still be able to use malloc(3) and + realloc(3) under the hood. It’s easy to provide a zeroing + malloc(3)-workalike, but as far as I can tell, it’s impossible to do so for + realloc(3)-alike unless we either (1) maintain our own allocation size + tracking or (2) use highly non-portable code. Neither of these seemed worth + the effort and complexity. + + This is because, as it turns out, the length of an allocated buffer is a + more complicated notion than it seems. A buffer has *two* different + lengths: L1 is the size requested by the original caller, and L2 is the + size actually allocated; L2 ≥ L1. Neither are reliably available: + + * L1: The allocator can’t provide it, and while the caller had it at the + time of previous allocation, it might not have kept it. + + * L2: Not available from the libc allocator without fairly extreme + non-portability and/or difficult constraints [1], though libgc does + provide it with GC_size(). The caller never knew it. + + Suppose we call realloc() with a new length Lν, where Lν > L2 ≥ L1. 
To zero + the new part of the buffer, we must zero (L1,Lν], or (L2,Lν] if we assume + (L1,L2] are still zero from the initial malloc(), and leave prior bytes + untouched. But we don’t know either L1 or L2 reliably, so we’re hosed, + whether we call an upstream realloc() or malloc() an entirely new buffer, + then memcpy(3). + + I suspect this is why libc provides calloc(3) but not an equivalent for + realloc(3). + + [1]: https://stackoverflow.com/questions/1281686 */ + +#define _GNU_SOURCE + +#include +#include +#include + +#include "config.h" +#include "mem.h" +#include "misc.h" + + +/** Macros **/ + +/** Types **/ + +/** Constants **/ + +/** Globals **/ + +/* Size of the stack and heap at previous ch_memory_log() call. These are + signed to avoid subtraction gotchas. */ +ssize_t stack_prev = 0; +ssize_t heap_prev = 0; + + +/** Functions **/ + + +/* Return a snprintf(3)-formatted string in a newly allocated buffer of + appropriate length. Exit on error. + + This function formats the string twice: Once to figure out how long the + formatted string is, and again to actually format the string. I’m not aware + of a better way to compute string length. (musl does it the same way; glibc + was too complicated for my patience in figuring it out.) + + An alternative would be to allocate a small buffer, try that, and if it’s + too small re-allocate and format again. For strings that fit, this would + save a formatting cycle at the cost of wasted memory and more code paths. + That didn’t seem like the right trade-off, esp. since short strings should + be the fastest to format. */ +char *ch_asprintf(const char *fmt, ...) 
+{ + va_list ap; + int str_len; + char *str; + + va_start(ap, fmt); + + T_ (0 <= (str_len = vsnprintf(NULL, 0, fmt, ap))); + str = ch_malloc(str_len + 1, false); + T_ (str_len == vsnprintf(str, str_len + 1, fmt, ap)); + + va_end(ap); + + return str; +} + +/* Return a new null-terminated string containing the next record from fp, + where records are delimited by delim (e.g., pass '\n' to get the next + line). If no more records available, return NULL. Exit on error. + + Unlike getdelim(3), the delimiter is *not* part of the returned string. + + Warnings: + + 1. Records cannot contain the zero byte, and behavior is undefined if fp + contains any zeros and delimiter is not '\0'. + + 2. The returned buffer is likely larger than needed. We assume wasting + this space is better than the overhead of realloc’ing down to a + precise size. */ +char *ch_getdelim(FILE *fp, char delim) +{ + size_t bytes_read = 0; + size_t buf_len = 8; // non-zero start avoids early frequent realloc + char *buf = ch_malloc(buf_len, false); + + while (true) { + int c = fgetc(fp); + if (c == EOF) + break; + bytes_read++; + if (bytes_read > buf_len) { // room for terminator ensured later + buf_len *= 2; + buf = ch_realloc(buf, buf_len, false); + } + buf[bytes_read-1] = c; + if (c == delim) + break; + } + + if (bytes_read > 0 && buf[bytes_read-1] == delim) { // found delimiter + buf[bytes_read-1] = '\0'; + } else if (feof(fp)) { // end-of-file + if (bytes_read == 0) // no record left + return NULL; + else { // record ends at EOF (no delimiter) + if (bytes_read >= buf_len) { + T_ (bytes_read == buf_len); + buf = ch_realloc(buf, buf_len + 1, false); + } + buf[bytes_read] = '\0'; + } + } else { // error + Te (0, "error reading file"); // don’t know filename here + } + + return buf; +} + +/* Allocate and return a new buffer of length size bytes. The initial contents + of the buffer are undefined. + + If pointerful, then the buffer may contain pointers. 
Otherwise, the caller + guarantees no pointers will ever be stored in the buffer. This allows + garbage collection optimizations. If unsure, say true. */ +void *ch_malloc(size_t size, bool pointerful) +{ + void *buf; + +#ifdef HAVE_GC + #error +#else + (void)pointerful; // suppress warning + T_ (buf = malloc(size)); +#endif + + return buf; +} + +/* Initialize memory management. */ +void ch_memory_init() +{ + ch_memory_log("init"); +} + + +/* Log stack and heap memory usage, and GC statistics if enabled, to stderr + and syslog if enabled. */ +void ch_memory_log(const char *when) +{ + FILE *fp; + char *line = NULL; + ssize_t stack_len = 0, heap_len = 0; + char *text; + + /* Compute stack and heap size. While awkward, AFAICT this is the best + available way to get these sizes. See proc_pid_maps(5). + Whitespace-separated (?) fields: + + 1. start (inclusive) and end (exclusive) addresses, in hex + 2. permissions, e.g. “r-xp” + 3. offset, in hex + 4. device major:minor, in hex? + 5. inode number, in decimal + 6. pathname */ + T_ (fp = fopen("/proc/self/maps", "r")); + while ((line = ch_getdelim(fp, '\n'))) { + int conv_ct; + void *start, *end; + char path[8]; // must match literal in format string! + conv_ct = sscanf(line, "%p-%p %*[rwxp-] %*x %*x:%*x %*u %7s", + &start, &end, path); + if (conv_ct != 3) { + WARNING("please report this bug: can't parse map: %s", line); + break; + } + if (!strcmp(path, "[stack]")) + stack_len += end - start; + else if (!strcmp(path, "[heap]")) + heap_len += end - start; + } + Z_ (fclose(fp)); + + // log the basics + text = ch_asprintf("mem: %s: stack %zu kB %+zd, heap %zu kB %+zd", when, + stack_len / 1024, (stack_len - stack_prev) / 1024, + heap_len / 1024, (heap_len - heap_prev) / 1024); + VERBOSE(text); +#ifdef HAVE_SYSLOG + syslog(SYSLOG_PRI, "%s", text); +#endif + stack_prev = stack_len; + heap_prev = heap_len; + + // log GC stuff +#ifdef HAVE_GC + FIXME +#endif +} + +/* Change the size of allocated buffer p to size bytes. 
Like realloc(3), if p + is NULL, then this function is equivalent to ch_malloc(). Unlike realloc(3), + size may not be zero. + + If size is greater than the existing buffer length, the initial content of + new bytes is undefined. If size is less than the existing buffer length, + this function may be a no-op; i.e., it may be impossible to shrink a + buffer’s actual allocation. + + pointerful is as in ch_malloc(). If p is non-NULL, it must match the + original allocation, though this is not validated. */ +void *ch_realloc(void *p, size_t size, bool pointerful) +{ + void *p_new; + +#ifdef HAVE_GC + #error +#else + (void)pointerful; // suppress warning + T_ (p_new = realloc(p, size)); +#endif + + return p_new; +} diff --git a/bin/mem.h b/bin/mem.h new file mode 100644 index 000000000..16b263c6c --- /dev/null +++ b/bin/mem.h @@ -0,0 +1,18 @@ +/* Memory management routines. */ + +#define _GNU_SOURCE +#pragma once + +#include +#include + +/** Function prototypes **/ + +char *ch_asprintf(const char *fmt, ...); +char *ch_getdelim(FILE *fp, char delim); +void ch_memory_init(); +void ch_memory_log(const char *when); +void *ch_malloc(size_t size, bool pointerful); +void *ch_realloc(void *p, size_t size, bool pointerful); +char *ch_strdup(const char *src); +void garbageinate(void); From 607e1b605cb78e12c85546c2745c0f010adb4a95 Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky Date: Thu, 12 Sep 2024 16:30:01 -0600 Subject: [PATCH 24/29] basic memory functions [skip ci] --- bin/Makefile.am | 2 +- bin/ch-run.c | 9 ++++++--- bin/json.c | 7 ++++--- bin/json.h | 2 +- bin/mem.c | 32 ++++++++++++++++++-------------- bin/misc.h | 5 +++++ 6 files changed, 35 insertions(+), 22 deletions(-) diff --git a/bin/Makefile.am b/bin/Makefile.am index ac34038fa..c21766217 100644 --- a/bin/Makefile.am +++ b/bin/Makefile.am @@ -8,7 +8,7 @@ bin_PROGRAMS = ch-checkns ch-run ch_checkns_SOURCES = ch-checkns.c misc.h misc.c -ch_run_SOURCES = ch-run.c core.h core.c hook.h hook.c misc.h misc.c 
+ch_run_SOURCES = ch-run.c core.h core.c hook.h hook.c mem.h mem.c misc.h misc.c if HAVE_JSON ch_run_SOURCES += json.h json.c endif diff --git a/bin/ch-run.c b/bin/ch-run.c index 79996f97a..dc3f0b5c3 100644 --- a/bin/ch-run.c +++ b/bin/ch-run.c @@ -18,6 +18,7 @@ #ifdef HAVE_JSON #include "json.h" #endif +#include "mem.h" #include "misc.h" #ifdef HAVE_SECCOMP #include "seccomp.h" @@ -194,11 +195,12 @@ int main(int argc, char *argv[]) T_ (warnings != MAP_FAILED); privs_verify_invoking(); + ch_memory_init(); Z_ (atexit(warnings_reprint)); #ifdef ENABLE_SYSLOG - syslog(LOG_USER|LOG_INFO, "uid=%u args=%d: %s", getuid(), argc, + syslog(SYSLOG_PRI, "uid=%u args=%d: %s", getuid(), argc, argv_to_string(argv)); #endif @@ -256,6 +258,7 @@ int main(int argc, char *argv[]) Z_ (unsetenv("ARGP_HELP_FMT")); logging_init(args.log_color, args.log_test); + ch_memory_log("start"); if (arg_next >= argc - 1) { printf("usage: ch-run [OPTION...] IMAGE -- COMMAND [ARG...]\n"); @@ -427,12 +430,12 @@ void hooks_env_install(struct args *args) case ENV_CDI_DEV: name = "env-set-cdi"; f = hook_envs_set; - d = cdi_envs_get(arg); + //d = cdi_envs_get(arg); break; case ENV_CDI_ALL: name = "env-set-cdi-all"; f = hook_envs_set; - d = cdi_envs_get(NULL); + //d = cdi_envs_get(NULL); case ENV_END: T_ (false); // unreachable break; diff --git a/bin/json.c b/bin/json.c index 6e04dbafd..5b294ff4d 100644 --- a/bin/json.c +++ b/bin/json.c @@ -167,14 +167,15 @@ bool cdi_devid_kind_p(const char *devid) are newly allocated; the caller must free the list with envs_free(). */ struct env_var *cdi_envs_get(const char *devid) { - struct env_var *vars; + //struct env_var *vars; // count variables so we can do just one allocation - for () + //for () // set up the list - return vars; + //return vars; + return NULL; } /* Free spec. 
*/ diff --git a/bin/json.h b/bin/json.h index 4b86a7660..4c425652c 100644 --- a/bin/json.h +++ b/bin/json.h @@ -27,5 +27,5 @@ struct cdi_config { /** Function prototypes **/ -void cdi_envs_get(const char *devid); +struct env_var *cdi_envs_get(const char *devid); void cdi_init(struct cdi_config *cf); diff --git a/bin/mem.c b/bin/mem.c index efdead799..68b0f6d40 100644 --- a/bin/mem.c +++ b/bin/mem.c @@ -38,6 +38,7 @@ #include #include #include +#include #include "config.h" #include "mem.h" @@ -76,17 +77,19 @@ ssize_t heap_prev = 0; be the fastest to format. */ char *ch_asprintf(const char *fmt, ...) { - va_list ap; + va_list ap1, ap2; int str_len; char *str; - va_start(ap, fmt); + va_start(ap1, fmt); + va_copy(ap2, ap1); - T_ (0 <= (str_len = vsnprintf(NULL, 0, fmt, ap))); + T_ (0 <= (str_len = vsnprintf(NULL, 0, fmt, ap1))); str = ch_malloc(str_len + 1, false); - T_ (str_len == vsnprintf(str, str_len + 1, fmt, ap)); + T_ (str_len == vsnprintf(str, str_len + 1, fmt, ap2)); - va_end(ap); + va_end(ap1); + va_end(ap2); return str; } @@ -164,13 +167,13 @@ void *ch_malloc(size_t size, bool pointerful) return buf; } -/* Initialize memory management. */ -void ch_memory_init() +/* Initialize memory management. + + We don’t log usage here because it’s called before logging is up. */ +void ch_memory_init(void) { - ch_memory_log("init"); } - /* Log stack and heap memory usage, and GC statistics if enabled, to stderr and syslog if enabled. */ void ch_memory_log(const char *when) @@ -194,11 +197,12 @@ void ch_memory_log(const char *when) while ((line = ch_getdelim(fp, '\n'))) { int conv_ct; void *start, *end; - char path[8]; // must match literal in format string! + char path[8] = { 0 }; // length must match format string! 
conv_ct = sscanf(line, "%p-%p %*[rwxp-] %*x %*x:%*x %*u %7s", &start, &end, path); - if (conv_ct != 3) { - WARNING("please report this bug: can't parse map: %s", line); + if (conv_ct < 2) { // will be 2 if path empty + WARNING("please report this bug: can't parse map: %d: \"%s\"", + conv_ct, line); break; } if (!strcmp(path, "[stack]")) @@ -209,11 +213,11 @@ void ch_memory_log(const char *when) Z_ (fclose(fp)); // log the basics - text = ch_asprintf("mem: %s: stack %zu kB %+zd, heap %zu kB %+zd", when, + text = ch_asprintf("mem: %s: stack %zd kB %+zd, heap %zd kB %+zd", when, stack_len / 1024, (stack_len - stack_prev) / 1024, heap_len / 1024, (heap_len - heap_prev) / 1024); VERBOSE(text); -#ifdef HAVE_SYSLOG +#ifdef ENABLE_SYSLOG syslog(SYSLOG_PRI, "%s", text); #endif stack_prev = stack_len; diff --git a/bin/misc.h b/bin/misc.h index aec855943..6d9dce82b 100644 --- a/bin/misc.h +++ b/bin/misc.h @@ -23,6 +23,11 @@ and hopefully others support the following extension. */ #define noreturn __attribute__ ((noreturn)) +/* Syslog facility and level we use. */ +#ifdef ENABLE_SYSLOG +#define SYSLOG_PRI (LOG_USER|LOG_INFO) +#endif + /* Size of “warnings” buffer, in bytes. We want this to be big enough that we don’t need to worry about running out of room. 
*/ #define WARNINGS_SIZE (4*1024) From 04243a907c12b6eb9194c54d39cb13254dd239ff Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky Date: Thu, 12 Sep 2024 16:41:14 -0600 Subject: [PATCH 25/29] log memory usage on exit [skip ci] --- bin/ch-run.c | 2 ++ bin/mem.c | 5 +++++ bin/mem.h | 3 ++- 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/bin/ch-run.c b/bin/ch-run.c index dc3f0b5c3..33fb871fe 100644 --- a/bin/ch-run.c +++ b/bin/ch-run.c @@ -197,6 +197,8 @@ int main(int argc, char *argv[]) privs_verify_invoking(); ch_memory_init(); + // note: exit functions not called on fatal error if --abort-on-fatal + Z_ (atexit(ch_memory_log_exit)); Z_ (atexit(warnings_reprint)); #ifdef ENABLE_SYSLOG diff --git a/bin/mem.c b/bin/mem.c index 68b0f6d40..2385392e7 100644 --- a/bin/mem.c +++ b/bin/mem.c @@ -229,6 +229,11 @@ void ch_memory_log(const char *when) #endif } +void ch_memory_log_exit(void) +{ + ch_memory_log("exit"); +} + /* Change the size of allocated buffer p to size bytes. Like realloc(3), if p is NULL, then this function is equivalent to ch_malloc(). Unlike free(3), size may not be zero. 
diff --git a/bin/mem.h b/bin/mem.h index 16b263c6c..6e598fa4c 100644 --- a/bin/mem.h +++ b/bin/mem.h @@ -10,8 +10,9 @@ char *ch_asprintf(const char *fmt, ...); char *ch_getdelim(FILE *fp, char delim); -void ch_memory_init(); +void ch_memory_init(void); void ch_memory_log(const char *when); +void ch_memory_log_exit(void); void *ch_malloc(size_t size, bool pointerful); void *ch_realloc(void *p, size_t size, bool pointerful); char *ch_strdup(const char *src); From 601fc07094774db97448223c3c47bf69b7d90344 Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky Date: Fri, 13 Sep 2024 14:45:58 -0600 Subject: [PATCH 26/29] add actual libgc [skip ci] --- bin/ch-run.c | 2 +- bin/core.c | 2 + bin/json.c | 16 +++-- bin/json.h | 2 +- bin/mem.c | 171 ++++++++++++++++++++++++++++++++++++++++++--------- bin/mem.h | 2 +- configure.ac | 123 +++++++++++++++++++++++------------- 7 files changed, 234 insertions(+), 84 deletions(-) diff --git a/bin/ch-run.c b/bin/ch-run.c index 33fb871fe..40dc67240 100644 --- a/bin/ch-run.c +++ b/bin/ch-run.c @@ -198,7 +198,7 @@ int main(int argc, char *argv[]) ch_memory_init(); // note: exit functions not called on fatal error if --abort-on-fatal - Z_ (atexit(ch_memory_log_exit)); + Z_ (atexit(ch_memory_exit)); Z_ (atexit(warnings_reprint)); #ifdef ENABLE_SYSLOG diff --git a/bin/core.c b/bin/core.c index 023ca0591..74c3e89be 100644 --- a/bin/core.c +++ b/bin/core.c @@ -19,6 +19,7 @@ #include #include +#include "mem.h" #include "misc.h" #include "core.h" #ifdef HAVE_LIBSQUASHFUSE @@ -633,6 +634,7 @@ void run_user_command(char *argv[], const char *initial_dir) T_ (freopen("/dev/null", "w", stdout)); if (verbose < LL_STDERR) T_ (freopen("/dev/null", "w", stderr)); + ch_memory_log("usrx"); execvp(argv[0], argv); // only returns if error Tf (0, "can't execve(2): %s", argv[0]); } diff --git a/bin/json.c b/bin/json.c index 5b294ff4d..d0058658e 100644 --- a/bin/json.c +++ b/bin/json.c @@ -59,14 +59,6 @@ struct json_dispatch { // Block size in bytes for reading JSON 
files. const size_t READ_SZ = 16384; -/** Globals **/ - -// List of CDI specs we’ve read. Yes it’s a global, but that lets us keep -// struct cdi_spec private to this file, which seemed like the right -// trade-off. It also seemed like “all the specs we know about” wasn’t -// something we needed multiple of. -struct cdi_spec *cdi_specs = NULL; - /** Function prototypes (private) **/ @@ -89,7 +81,13 @@ void cdiPC_hook(cJSON *tree, struct cdi_spec *spec); void cdiPC_kind(cJSON *tree, struct cdi_spec *spec); -/** Global variables **/ +/** Globals **/ + +// List of CDI specs we’ve read. Yes it’s a global, but that lets us keep +// struct cdi_spec private to this file, which seemed like the right +// trade-off. It also seemed like “all the specs we know about” wasn’t +// something we needed multiple of. +struct cdi_spec *cdi_specs = NULL; /* Callback tables. In the struct, the callback’s second argument is “void *” so any state object can be provided. However, we’d prefer the actual diff --git a/bin/json.h b/bin/json.h index 4c425652c..2de94d1d9 100644 --- a/bin/json.h +++ b/bin/json.h @@ -5,10 +5,10 @@ #define _GNU_SOURCE #pragma once +#include "config.h" #include -#include "config.h" #include "core.h" #include "misc.h" diff --git a/bin/mem.c b/bin/mem.c index 2385392e7..1e576609d 100644 --- a/bin/mem.c +++ b/bin/mem.c @@ -1,4 +1,20 @@ -/* Re. zeroing newly-allocated memory: +/* libgc API + --------- + + See: + + https://hboehm.info/gc/gcinterface.html + https://github.com/ivmai/bdwgc/blob/57ccbcc/include/gc/gc.h#L459 + + The latter is more complete. + + libgc provides both upper-case, e.g. GC_MALLOC(), and lower-case, e.g. + GC_malloc(), versions of many functions. It’s not totally clear to me what + the separation principles are, though the vibe does seem to prefer the + upper-case versions. We use the upper-case when available. 
+ + Zeroing newly-allocated memory + ------------------------------ Because we use a lot of zero-terminated data structures, it would be nice for the allocation functions to return zeroed buffers. We also want to not @@ -34,13 +50,17 @@ [1]: https://stackoverflow.com/questions/1281686 */ #define _GNU_SOURCE +#include "config.h" #include #include #include #include -#include "config.h" +#ifdef HAVE_GC +#include +#endif + #include "mem.h" #include "misc.h" @@ -51,16 +71,53 @@ /** Constants **/ +/** Function prototytpes (private) **/ + +ssize_t kB(ssize_t byte_ct); + + /** Globals **/ -/* Size of the stack and heap at previous ch_memory_log() call. These are - signed to avoid subtraction gotchas. */ +/* Note: All the memory statistics are signed “ssize_t” rather than the more + correct unsigned “size_t” so that subtractions are less error-prone (we + report lots of differences). We assume that memory usage is small enough + for this to not matter. */ + +/* Size of the stack, heap, and anonymous mmap(2) mappings at previous + ch_memory_log() call. */ ssize_t stack_prev = 0; ssize_t heap_prev = 0; +ssize_t anon_prev = 0; + +#ifdef HAVE_GC +/* Note: The first four counters are from GC_prof_stats_s fields and have the + corresponding names. Total size of allocated blocks is derived. See gc.h. */ -/** Functions **/ +/* Total size of the heap. This includes “unmapped” bytes that libgc is + tracking but has given back to the OS, I assume to be re-requested from the + OS if needed. */ +ssize_t heapsize_prev = 0; + +/* Free bytes in the heap, both mapped and unmapped. */ +ssize_t free_prev = 0; +/* Unmapped bytes (i.e., returned to the OS but still tracked by libgc) in the + heap. */ +ssize_t unmapped_prev = 0; + +/* Number of garbage collections done so far. */ +ssize_t gc_no_prev = 0; + +/* Total time spent doing garbage collection, in milliseconds. Corresponds to + GC_get_full_gc_total_time(). 
Note that because ch-run is single-threaded, + we do not report time spent collecting with the world stopped. */ +long time_collecting_prev = 0; + +#endif + + +/** Functions **/ /* Return a snprintf(3)-formatted string in a newly allocated buffer of appropriate length. Exit on error. @@ -158,20 +215,31 @@ void *ch_malloc(size_t size, bool pointerful) void *buf; #ifdef HAVE_GC - #error + buf = pointerful ? GC_MALLOC(size) : GC_MALLOC_ATOMIC(size); #else (void)pointerful; // suppress warning - T_ (buf = malloc(size)); + buf = malloc(size); #endif + T_ (buf); return buf; } -/* Initialize memory management. +/* Shut down memory management. */ +void ch_memory_exit(void) +{ + ch_memory_log("exit"); +} - We don’t log usage here because it’s called before logging is up. */ +/* Initialize memory management. We don’t log usage here because it’s called + before logging is up. */ void ch_memory_init(void) { +#ifdef HAVE_GC + //GC_set_handle_fork(1); // I think the default mode is fine??? + GC_INIT(); + GC_start_performance_measurement(); +#endif } /* Log stack and heap memory usage, and GC statistics if enabled, to stderr @@ -180,11 +248,16 @@ void ch_memory_log(const char *when) { FILE *fp; char *line = NULL; - ssize_t stack_len = 0, heap_len = 0; - char *text; + char *s; + ssize_t stack_len = 0, heap_len = 0, anon_len = 0; +#ifdef HAVE_GC + struct GC_prof_stats_s ps; + ssize_t alloc, alloc_prev; + long time_collecting; +#endif - /* Compute stack and heap size. While awkward, AFAICT this is the best - available way to get these sizes. See proc_pid_maps(5). + /* Compute stack, heap, and anonymous mapping sizes. While awkward, AFAICT + this is the best available way to get these sizes. See proc_pid_maps(5). Whitespace-separated (?) fields: 1. 
start (inclusive) and end (exclusive) addresses, in hex @@ -205,7 +278,9 @@ void ch_memory_log(const char *when) conv_ct, line); break; } - if (!strcmp(path, "[stack]")) + if (strlen(path) == 0) + anon_len += end - start; + else if (!strcmp(path, "[stack]")) stack_len += end - start; else if (!strcmp(path, "[heap]")) heap_len += end - start; @@ -213,25 +288,52 @@ void ch_memory_log(const char *when) Z_ (fclose(fp)); // log the basics - text = ch_asprintf("mem: %s: stack %zd kB %+zd, heap %zd kB %+zd", when, - stack_len / 1024, (stack_len - stack_prev) / 1024, - heap_len / 1024, (heap_len - heap_prev) / 1024); - VERBOSE(text); + s = ch_asprintf("mem: %s: " + "stac %zdkB %+zd, heap %zdkB %+zd, anon %zdkB %+zd", + when, + kB(stack_len), kB(stack_len - stack_prev), + kB(heap_len), kB(heap_len - heap_prev), + kB(anon_len), kB(anon_len - anon_prev)); + DEBUG(s); #ifdef ENABLE_SYSLOG - syslog(SYSLOG_PRI, "%s", text); + syslog(SYSLOG_PRI, "%s", s); #endif stack_prev = stack_len; heap_prev = heap_len; + anon_prev = anon_len; // log GC stuff #ifdef HAVE_GC - FIXME + GC_get_prof_stats(&ps, sizeof(ps)); + time_collecting = GC_get_full_gc_total_time(); + alloc = ps.heapsize_full - ps.free_bytes_full; + alloc_prev = heapsize_prev - free_prev; + s = ch_asprintf("gc: " + "%s: %ld collections (%+ld) in %zdms (%+zd)", + when, + ps.gc_no, ps.gc_no - gc_no_prev, + time_collecting, time_collecting - time_collecting_prev); + DEBUG(s); +#ifdef ENABLE_SYSLOG + syslog(SYSLOG_PRI, "%s", s); +#endif + gc_no_prev = ps.gc_no; + time_collecting_prev = time_collecting; + s = ch_asprintf("gc: %s: " + "totl %zdkB %+zd, allc %zdkB %+zd, free %zdkB %+zd, unmp %zdkB %+zd", + when, + kB(ps.heapsize_full), kB(ps.heapsize_full - heapsize_prev), + kB(alloc), kB(alloc - alloc_prev), + kB(ps.free_bytes_full), kB(ps.free_bytes_full - free_prev), + kB(ps.unmapped_bytes), kB(ps.unmapped_bytes - unmapped_prev)); + DEBUG(s); +#ifdef ENABLE_SYSLOG + syslog(SYSLOG_PRI, "%s", s); +#endif + heapsize_prev = 
ps.heapsize_full; + free_prev = ps.free_bytes_full; + unmapped_prev = ps.unmapped_bytes; #endif -} - -void ch_memory_log_exit(void) -{ - ch_memory_log("exit"); } /* Change the size of allocated buffer p to size bytes. Like realloc(3), if p @@ -249,12 +351,25 @@ void *ch_realloc(void *p, size_t size, bool pointerful) { void *p_new; + T_ (size > 0); + + if (p == NULL) + p_new = ch_malloc(size, pointerful); // no GC_REALLOC_ATOMIC() + else { #ifdef HAVE_GC - #error + p_new = GC_REALLOC(p, size); #else - (void)pointerful; // suppress warning - T_ (p_new = realloc(p, size)); + p_new = realloc(p, size); #endif + } + T_ (p_new); return p_new; } + +/* Convert a signed number of bytes to kilobytes (truncated) and return it. */ +ssize_t kB(ssize_t byte_ct) +{ + return byte_ct / 1024; +} + diff --git a/bin/mem.h b/bin/mem.h index 6e598fa4c..a6dcd66b2 100644 --- a/bin/mem.h +++ b/bin/mem.h @@ -10,9 +10,9 @@ char *ch_asprintf(const char *fmt, ...); char *ch_getdelim(FILE *fp, char delim); +void ch_memory_exit(void); void ch_memory_init(void); void ch_memory_log(const char *when); -void ch_memory_log_exit(void); void *ch_malloc(size_t size, bool pointerful); void *ch_realloc(void *p, size_t size, bool pointerful); char *ch_strdup(const char *src); diff --git a/configure.ac b/configure.ac index 0f5990d06..e9d24cdcd 100644 --- a/configure.ac +++ b/configure.ac @@ -188,6 +188,7 @@ AC_ARG_ENABLE([test], AS_HELP_STRING([--disable-test], [test suite]), [], [enable_test=yes]) +# --with-seccomp AC_ARG_WITH([seccomp], AS_HELP_STRING([--with-seccomp=(yes|no)], [support for --seccomp])) @@ -207,6 +208,38 @@ AS_CASE([$with_seccomp], [*], # anything else [AC_MSG_ERROR([invalid --with-seccomp arg: $with_seccomp])]) +# --with-gc +AC_ARG_WITH([gc], + AS_HELP_STRING([--with-gc=@<:@yes|no@:>@], + [enable conservative garbage collection with libgc])) +AS_CASE([$with_gc], + [yes], + [want_gc=yes + need_gc=yes], + [no], + [want_gc=no + need_gc=no], + [''], + [want_gc=yes + need_gc=no], + [*], + 
[AC_MSG_ERROR([--with-gc: bad argument: $with_gc])]) + +AC_ARG_WITH([gc-include], + AS_HELP_STRING([--with-gc-include=DIR], + [directory containing gc.h (if not in defaults)])) +AS_IF([test -n "$with_gc_include"], + [inc_libgc=$with_gc_include + CH_REQUIRE_DIR([$inc_gc], [--with-gc-include])]) + +AC_ARG_WITH([gc-lib], + AS_HELP_STRING([--with-gc-lib=DIR], + [directory containing libgc.so (if not in defaults)])) +AS_IF([test -n "$with_gc_lib"], + [lib_json=$with_gc_lib + CH_REQUIRE_DIR([$lib_gc], [--with-gc])]) + +# --with-json AC_ARG_WITH([json], AS_HELP_STRING([--with-json=@<:@yes|no@:>@], [enable JSON features by linking with libcjson])) @@ -237,6 +270,7 @@ AS_IF([test -n "$with_json_lib"], [lib_json=$with_json_lib CH_REQUIRE_DIR([$lib_json], [--with-json-lib])]) +# --with-libsquashfuse AC_ARG_WITH([libsquashfuse], AS_HELP_STRING([--with-libsquashfuse=@<:@yes|no|PATH@:>@], [whether to link with libsquashfuse])) @@ -284,6 +318,8 @@ AC_PROG_CC ch_cflags='-std=c99 -Wall -Wno-unused-command-line-argument' AS_IF([test $use_werror = yes], [ch_cflags="$ch_cflags -Werror"]) +AS_IF([test -n "$inc_gc"], # -L$lib_gc added below + [ch_cflags="$ch_cflags -I$inc_gc"]) AS_IF([test -n "$inc_json"], # -L$lib_json added below [ch_cflags="$ch_cflags -I$inc_json"]) AS_IF([test -n "$lib_libsquashfuse"], @@ -306,30 +342,6 @@ AS_IF([test "$CC" = icc], # things to LIBS, which we don’t want because it’s applied to all executables. CH_RUN_LIBS= -# asprintf(3) -# -# You can do this with AC_CHECK_FUNC or AC_CHECK_FUNCS, but those macros call -# the function with no arguments. This causes a warning for asprintf() for -# some compilers (and I have no clue why others accept it); see issue #798. -# Instead, try to build a small test program that calls asprintf() correctly. 
-AC_MSG_CHECKING([for asprintf in libc]) -AC_COMPILE_IFELSE([AC_LANG_SOURCE([[ - #define _GNU_SOURCE - #include - #include - - int main(void) - { - char *p; - if (asprintf(&p, "WEIRD AL YANKOVIC\n") >= 0) - free(p); - return 0; - } - ]])], - [AC_MSG_RESULT([yes])], - [AC_MSG_RESULT([no]) - AC_MSG_ERROR([asprintf(3) not found; please report this bug])]) - # argp_parse(3), which is included with glibc but not other libc’s, e.g. musl. AC_MSG_CHECKING([for argp_parse in libc]) AC_LINK_IFELSE([AC_LANG_SOURCE([[ @@ -473,15 +485,29 @@ AC_CHECK_DECL(FNM_EXTMATCH, [[#define _GNU_SOURCE #include ]]) -# GNU extension type for the comparison function argument to qsort(3). -AC_CHECK_TYPE(comparison_fn_t, - [have_comparison_fn_t=yes], - [have_comparison_fn_t=no], - [[#define _GNU_SOURCE - #include ]]) - -# cJSON. Note that we don’t try to ensure the header we find matches the +# libgc. Note that we don’t try to ensure the header we find matches the # library we find. Hopefully that’s not a problem. +AS_IF([test $want_gc = yes], [ + AC_CHECK_LIB(gc, GC_malloc, + [have_libgc=yes + AS_IF([test -n "$lib_gc"], + [CH_RUN_LIBS="-Wl,-rpath=$lib_gc $CH_RUN_LIBS"]) + CH_RUN_LIBS="-lgc $CH_RUN_LIBS"], + [have_libgc=no], + [$CH_RUN_LIBS]) + AC_CHECK_HEADER([gc.h], + [have_gc_h=yes], + [have_gc_h=no]) +], [have_libgc=no + have_gc_h=no]) +# Error out if needed but not found. +AS_IF([test $have_libgc = yes && test $have_gc_h = yes], + [have_gc=yes], + [have_gc=no]) +AS_IF([test $need_gc = yes && test $have_gc = no], + [AC_MSG_ERROR([--with-gc=yes but libgc not found])]) + +# cJSON. Also do not check this header matches the library we find. AS_IF([test $want_json = yes], [ AC_CHECK_LIB(cjson, cJSON_ParseWithLength, [have_libcjson=yes @@ -866,10 +892,10 @@ AC_SUBST([CH_RUN_LIBS]) AC_SUBST([PYTHON_SHEBANG]) AC_SUBST([SPHINX]) -AS_IF([test $have_comparison_fn_t = yes], - [AC_DEFINE([HAVE_COMPARISON_FN_T], [1], [comp. 
function for qsort(3)])]) AS_IF([test $have_fnm_extmatch = yes], [AC_DEFINE([HAVE_FNM_EXTMATCH], [1], [extended globs supported])]) +AS_IF([test $have_gc = yes], + [AC_DEFINE([HAVE_GC], [1], [enable garbage collection])]) AM_CONDITIONAL([HAVE_JSON], [test $have_json = yes]) AS_IF([test $have_json = yes], [AC_DEFINE([HAVE_JSON], [1], [enable JSON features]) @@ -897,13 +923,19 @@ AS_IF([ test $have_userns = yes], [have_ch_run=yes], [have_ch_run=no]) +AS_IF([ test $want_gc = yes], + [note_libgc=$have_libgc + note_gc_h=$have_gc_h], + [note_libgc='not tested' + note_gc_h='not tested']) + AS_IF([ test $want_json = yes], - [libcjson_note=$have_libcjson + [note_libcjson=$have_libcjson AS_IF([test $have_cjson_h = yes], - [cjson_h_note="yes, $cjson_h"], - [cjson_h_note=no])], - [libcjson_note='not tested' - cjson_h_note='not tested']) + [note_cjson_h="yes, $cjson_h"], + [note_cjson_h=no])], + [note_libcjson='not tested' + note_cjson_h='not tested']) # image builders @@ -1044,14 +1076,17 @@ Building Charliecloud \$CFLAGS ... ${CFLAGS} ch-run(1) library args ... ${CH_RUN_LIBS} - optional: - extended glob patterns in --unset-env ... ${have_fnm_extmatch} - comparison_fn_t from libc ... ${have_comparison_fn_t} + extended glob patterns in --unset-env: ${have_fnm_extmatch} + + garbage collection: ${have_gc} + enabled ... ${want_gc} + libgc ... ${note_libgc} + gc.h ... ${note_gc_h} JSON features: ${have_json} enabled ... ${want_json} - libcjson ... ${libcjson_note} - cJSON.h ... ${cjson_h_note} + libcjson ... ${note_libcjson} + cJSON.h ... ${note_cjson_h} ch-run(1) internal SquashFS mounting: ${have_libsquashfuse} enabled ... 
${want_libsquashfuse} From 83f526056ef041df075caab09382c58cf4452e90 Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky Date: Fri, 20 Sep 2024 16:40:24 -0600 Subject: [PATCH 27/29] convert memory management to our wrappers [skip ci] --- bin/ch-run.c | 77 +++++----- bin/core.c | 44 ++---- bin/core.h | 2 +- bin/fuse.c | 10 +- bin/hook.c | 20 +-- bin/json.c | 85 +++++----- bin/json.h | 1 + bin/mem.c | 101 ++++++++++-- bin/mem.h | 8 +- bin/misc.c | 419 +++++++++++++++++++++----------------------------- bin/misc.h | 9 +- bin/seccomp.c | 5 +- 12 files changed, 377 insertions(+), 404 deletions(-) diff --git a/bin/ch-run.c b/bin/ch-run.c index 40dc67240..332180d15 100644 --- a/bin/ch-run.c +++ b/bin/ch-run.c @@ -4,6 +4,8 @@ are modest and the program is short-lived. */ #define _GNU_SOURCE +#include "config.h" + #include #include #include @@ -12,7 +14,6 @@ #include #include -#include "config.h" #include "core.h" #include "hook.h" #ifdef HAVE_JSON @@ -202,8 +203,8 @@ int main(int argc, char *argv[]) Z_ (atexit(warnings_reprint)); #ifdef ENABLE_SYSLOG - syslog(SYSLOG_PRI, "uid=%u args=%d: %s", getuid(), argc, - argv_to_string(argv)); + syslog(SYSLOG_PRI, "uid=%u args=%d: %s", + getuid(), argc, argv_to_string(argv)); #endif username = getenv("USER"); @@ -260,10 +261,14 @@ int main(int argc, char *argv[]) Z_ (unsetenv("ARGP_HELP_FMT")); logging_init(args.log_color, args.log_test); - ch_memory_log("start"); + ch_memory_log("init"); +#ifdef HAVE_JSON + json_init(); +#endif + if (arg_next >= argc - 1) { - printf("usage: ch-run [OPTION...] IMAGE -- COMMAND [ARG...]\n"); + fprintf(stderr, "usage: ch-run [OPTION...] IMAGE -- COMMAND [ARG...]\n"); FATAL("IMAGE and/or COMMAND not specified"); } args.c.img_ref = argv[arg_next++]; @@ -352,27 +357,23 @@ bool get_first_env(char **array, char **name, char **value) environment changes. d must be NULL. 
*/ void hook_envs_def_first(struct container *c, void *d) { - char *vnew, *vold; + char *vold; T_ (d == NULL); // $HOME: If --home, set to “/home/$USER”. - if (c->host_home) { - vnew = cat("/home/", username); - env_set("HOME", vnew, false); - free(vnew); - } else if (path_exists("/root", NULL, true)) { + if (c->host_home) + env_set("HOME", cat("/home/", username), false); + else if (path_exists("/root", NULL, true)) env_set("HOME", "/root", false); - } else + else env_set("HOME", "/", false); // $PATH: Append /bin if not already present. vold = getenv("PATH"); - if (vold == NULL) { + if (vold == NULL) WARNING("$PATH not set"); - } else if (strstr(vold, "/bin") != vold && !strstr(vold, ":/bin")) { - T_ (1 <= asprintf(&vnew, "%s:/bin", vold)); - env_set("PATH", vnew, false); - } + else if (strstr(vold, "/bin") != vold && !strstr(vold, ":/bin")) + env_set("PATH", cat(vold, ":/bin"), false); // $TMPDIR: Unset. Z_ (unsetenv("TMPDIR")); @@ -407,7 +408,7 @@ void hooks_env_install(struct args *args) struct env_file *ef; name = "env-set-gfile"; f = hook_envs_set_file; - T_ (ef = malloc(sizeof(struct env_file))); + ef = ch_malloc(sizeof(struct env_file)); ef->path = arg; ef->delim = delim; ef->expand = args->c.env_expand; @@ -502,7 +503,7 @@ char *join_tag(char *cli_tag) } VERBOSE("join: peer group tag from getppid(2)"); - T_ (1 <= asprintf(&tag, "%d", getppid())); + tag = ch_asprintf("%d", getppid()); end: Te(tag[0] != '\0', "join: peer group tag cannot be empty string"); @@ -659,24 +660,22 @@ static error_t parse_opt(int key, char *arg, struct argp_state *state) abort_fatal = true; // in misc.c break; case 'b': { // --bind - char *src, *dst; - for (i = 0; args->c.binds[i].src != NULL; i++) // count existing binds - ; - T_ (args->c.binds = realloc(args->c.binds, - (i+2) * sizeof(struct bind))); - args->c.binds[i+1].src = NULL; // terminating zero - args->c.binds[i].dep = BD_MAKE_DST; - // source - src = strsep(&arg, ":"); - T_ (src != NULL); - Te (src[0] != 0, "--bind: no 
source provided"); - args->c.binds[i].src = src; - // destination - dst = arg ? arg : src; - Te (dst[0] != 0, "--bind: no destination provided"); - Te (strcmp(dst, "/"), "--bind: destination can't be /"); - Te (dst[0] == '/', "--bind: destination must be absolute"); - args->c.binds[i].dst = dst; + char *src, *dst; + i = list_count(args->c.binds, sizeof(args->c.binds[0])); + args->c.binds = ch_realloc(args->c.binds, (i+2) * sizeof(struct bind)); + memset(&args->c.binds[i+1], 0, sizeof(args->c.binds[0])); // terminate + args->c.binds[i].dep = BD_MAKE_DST; + // source + src = strsep(&arg, ":"); + T_ (src != NULL); + Te (src[0] != 0, "--bind: no source provided"); + args->c.binds[i].src = src; + // destination + dst = arg ? arg : src; + Te (dst[0] != 0, "--bind: no destination provided"); + Te (strcmp(dst, "/"), "--bind: destination can't be /"); + Te (dst[0] == '/', "--bind: destination must be absolute"); + args->c.binds[i].dst = dst; } break; case 'c': // --cd @@ -787,13 +786,13 @@ void privs_verify_invoking() T_ (euid == ruid && euid == suid); // no setuid or funny business } -/* Return path to the storage directory, if -s is not specified. */ +/* Return default path to the storage directory. */ char *storage_default(void) { char *storage = getenv("CH_IMAGE_STORAGE"); if (storage == NULL) - T_ (1 <= asprintf(&storage, "/var/tmp/%s.ch", username)); + storage = ch_asprintf("/var/tmp/%s.ch", username)); return storage; } diff --git a/bin/core.c b/bin/core.c index 74c3e89be..29f383b2c 100644 --- a/bin/core.c +++ b/bin/core.c @@ -206,12 +206,7 @@ void containerize(struct container *c) HOOK_DUP_OK add the hook anyway HOOK_DUP_SKIP silently do nothing (i.e., don’t add the hook) - HOOK_DUP_FAIL fatal error - - Warning: The hook framework does no memory management for name or d, i.e., - if name needs to be freed, that is the responsibility of the caller (this - function uses a copy), and/or if anything in d either needs to be freed, - that is the responsibility of the hook. 
*/ + HOOK_DUP_FAIL fatal error */ void hook_add(struct hook **hook_list, enum hook_dup dup, const char *name, hookf_t *f, void *d) { @@ -232,26 +227,24 @@ void hook_add(struct hook **hook_list, enum hook_dup dup, } } - T_ (h.name = strdup(name)); + h.name = name; h.f = f; h.data = d; list_append((void **)hook_list, &h, sizeof(h)); } -/* Run hooks in hook_list, passing c, then deallocate the list and set - *hook_list to NULL. hook_list must be a member of c. */ +/* Run hooks in hook_list, passing c, then set *hook_list to NULL. hook_list + must be a member of c. */ void hooks_run(struct container *c, struct hook **hook_list) { int hook_ct = list_count(*hook_list, sizeof((*hook_list)[0])); - for (int i = 0; (*hook_list)[i].f != NULL; i++) { + for (int i = 0; i < hook_ct; i++) { struct hook h = (*hook_list)[i]; DEBUG("calling hook %d/%d: %s", i+1, hook_ct, h.name); h.f(c, h.data); - free(h.name); } - free(*hook_list); *hook_list = NULL; } @@ -297,15 +290,12 @@ enum img_type image_type(const char *ref, const char *storage_dir) char *img_name2path(const char *name, const char *storage_dir) { char *path; - char *name_fs = strdup(name); + char *name_fs = ch_strdup(name); replace_char(name_fs, '/', '%'); replace_char(name_fs, ':', '+'); - T_ (1 <= asprintf(&path, "%s/img/%s", storage_dir, name_fs)); - - free(name_fs); // make Tim happy - return path; + return path_join(storage_dir, path_join("img", name_fs)); } /* Begin coordinated section of namespace joining. 
*/ @@ -405,18 +395,17 @@ void mounts_setup(struct container *c) char *options; struct stat st; VERBOSE("overlaying tmpfs for --write-fake (%s)", c->overlay_size); - T_ (1 <= asprintf(&options, "size=%s", c->overlay_size)); + options = cat("size=", c->overlay_size); Zf (mount(NULL, WF_MNT, "tmpfs", 0, options), "cannot mount tmpfs for overlay"); - free(options); Z_ (mkdir(WF_MNT "/upper", 0700)); Z_ (mkdir(WF_MNT "/work", 0700)); Z_ (mkdir(WF_MNT "/merged", 0700)); mkdir_scratch = WF_MNT "/mkdir_overmount"; Z_ (mkdir(mkdir_scratch, 0700)); - T_ (1 <= asprintf(&options, ("lowerdir=%s,upperdir=%s,workdir=%s," - "index=on,userxattr,volatile"), - c->newroot, WF_MNT "/upper", WF_MNT "/work")); + options = ch_asprintf(("lowerdir=%s,upperdir=%s,workdir=%s," + "index=on,userxattr,volatile"), + c->newroot, WF_MNT "/upper", WF_MNT "/work")); // update newroot Zf (stat(c->newroot, &st), "can't stat new root; overmounted by tmpfs for -W?: %s", c->newroot); @@ -424,7 +413,6 @@ void mounts_setup(struct container *c) Zf (mount(NULL, c->newroot, "overlay", 0, options), "can't overlay: %s, %s", c->newroot, options); VERBOSE("newroot updated: %s", c->newroot); - free(options); } DEBUG("starting bind-mounts"); // Bind-mount default files and directories. @@ -446,8 +434,6 @@ void mounts_setup(struct container *c) } // Bind-mount user-specified directories. bind_mounts(c->binds, c->newroot, 0, mkdir_scratch); - - free(nr_parent); } /* Join a specific namespace. 
*/ @@ -456,7 +442,7 @@ void namespace_join(pid_t pid, const char *ns) char *path; int fd; - T_ (1 <= asprintf(&path, "/proc/%d/ns/%s", pid, ns)); + path = ch_asprintf(&path, "/proc/%d/ns/%s", pid, ns); fd = open(path, O_RDONLY); if (fd == -1) { if (errno == ENOENT) { @@ -545,7 +531,7 @@ void passwd_setup(const struct container *c) struct passwd *p; // /etc/passwd - T_ (path = cat(host_tmp, "/ch-run_passwd.XXXXXX")); + path = cat(host_tmp, "/ch-run_passwd.XXXXXX"); T_ (-1 != (fd = mkstemp(path))); // mkstemp(3) writes path if (c->container_uid != 0) T_ (1 <= dprintf(fd, "root:x:0:0:root:/root:/bin/sh\n")); @@ -570,7 +556,7 @@ void passwd_setup(const struct container *c) Z_ (unlink(path)); // /etc/group - T_ (path = cat(host_tmp, "/ch-run_group.XXXXXX")); + path = cat(host_tmp, "/ch-run_group.XXXXXX"); T_ (-1 != (fd = mkstemp(path))); if (c->container_gid != 0) T_ (1 <= dprintf(fd, "root:x:0:\n")); @@ -615,8 +601,6 @@ void pivot(struct container *c) "can't pivot_root(2)"); Zf (chroot("."), "can't chroot(2) into new root"); Zf (umount2("/dev", MNT_DETACH), "can't umount old root"); - free(nr_parent); - free(nr_base); } /* Replace the current process with user command and arguments. */ diff --git a/bin/core.h b/bin/core.h index ac825babe..8615629fc 100644 --- a/bin/core.h +++ b/bin/core.h @@ -26,7 +26,7 @@ struct bind { struct container; // forward declaration to avoid definition loop typedef void (hookf_t)(struct container *, void *); struct hook { - char *name; + const char *name; hookf_t *f; void *data; }; diff --git a/bin/fuse.c b/bin/fuse.c index 164b8b16e..0d0186379 100644 --- a/bin/fuse.c +++ b/bin/fuse.c @@ -35,7 +35,7 @@ // Now we can include ll.h. #include -#include "config.h" +#include "config.h" // here to avoid potential clash with SquashFUSE config.h #include "core.h" #include "fuse.h" #include "misc.h" @@ -121,8 +121,7 @@ void sq_fork(struct container *c) // Default mount point? 
if (c->newroot == NULL) { - char *subdir; - T_ (asprintf(&subdir, "/%s.ch/mnt", username) > 0); + char *subdir = asprintf("/%s.ch/mnt", username); c->newroot = cat("/var/tmp", subdir); VERBOSE("using default mount point: %s", c->newroot); mkdirs("/var/tmp", subdir, NULL, NULL); @@ -141,8 +140,7 @@ void sq_fork(struct container *c) // Now that the filesystem is mounted, we can fork without race condition. // The child returns to caller and runs the user command. When that exits, // the parent gets SIGCHLD. - pid_child = fork(); - Tf (pid_child >= 0, "can't fork"); + pid_child = ch_fork(); if (pid_child > 0) // parent (child does nothing here) exit(sq_loop()); } @@ -229,7 +227,7 @@ void sq_mount(const char *img_path, char *mountpt) struct fuse_args mount_args = FUSE_ARGS_INIT(mount_argc, mount_argv); sq.mountpt = mountpt; - T_ (sq.chan = malloc(sizeof(sqfs_ll_chan))); + sq.chan = ch_malloc(sizeof(sqfs_ll_chan)); sq.ll = sqfs_ll_open(img_path, 0); Te (sq.ll != NULL, "can't open SquashFS: %s; try ch-run -vv?", img_path); diff --git a/bin/hook.c b/bin/hook.c index 1966e222a..f65cac869 100644 --- a/bin/hook.c +++ b/bin/hook.c @@ -1,8 +1,9 @@ /* Copyright © Triad National Security, LLC, and others. */ #define _GNU_SOURCE -#include +#include "config.h" +#include #include "core.h" #include "hook.h" @@ -14,30 +15,21 @@ /** Functions **/ -/* Set the environment variables listed in d, then free d. */ +/* Set the environment variables listed in d. */ void hook_envs_set(struct container *c, void *d) { struct env_var *vars = d; - envs_set(vars, c->env_expand); - envs_free(&vars); } -/* Set the environment variables specified in file d, then free d. NOTE: - d->path is still owned by hook_envs_install()’s caller, so we do not free - that buffer. */ +/* Set the environment variables specified in file d. 
*/ void hook_envs_set_file(struct container *c, void *d) { struct env_file *ef = d; - struct env_var *vars = env_file_read(ef->path, ef->delim); - - envs_set(vars, c->env_expand); - envs_free(&vars); - free(ef); + envs_set(env_file_read(ef->path, ef->delim);, c->env_expand); } -/* Unset the environment variables matching glob d. NOTE: d is owned by - hook_envs_install()’s caller, so we do not free it. */ +/* Unset the environment variables matching glob d. */ void hook_envs_unset(struct container *c, void *d) { envs_unset((char *)d); diff --git a/bin/json.c b/bin/json.c index d0058658e..3826543b8 100644 --- a/bin/json.c +++ b/bin/json.c @@ -1,6 +1,8 @@ /* Copyright © Triad National Security, LLC, and others. */ #define _GNU_SOURCE +#include "config.h" + #include #include #include @@ -8,8 +10,6 @@ #include #include -#include "config.h" - #include CJSON_H #include "core.h" @@ -64,7 +64,6 @@ const size_t READ_SZ = 16384; char **array_strings_json_to_c(cJSON *jarry, size_t *ct); void cdi_append(struct cdi_spec **specs, struct cdi_spec *spec); -void cdi_free(struct cdi_spec *spec); void cdi_hook_nv_ldcache(struct cdi_spec *spec, char **args); char *cdi_hook_to_string(const char *hook_name, char **args); void cdi_log(struct cdi_spec *spec); @@ -130,8 +129,8 @@ struct cdi_hook_dispatch cdi_hooks[] = { a freshly allocated NULL-terminated array of C strings (pointers to null-terminated chars buffers) and return that. ct is an out parameter - WARNING: This is a shallow copy, i.e., the actual strings are still owned - by the JSON array. */ + WARNING: This is a shallow copy, i.e., the actual strings are still shared + with the JSON array. 
*/ char **array_strings_json_to_c(cJSON *jarry, size_t *ct) { size_t i; @@ -140,7 +139,7 @@ char **array_strings_json_to_c(cJSON *jarry, size_t *ct) Tf (cJSON_IsArray(jarry), "JSON: expected array"); *ct = cJSON_GetArraySize(jarry); - T_ (carry = malloc((*ct + 1) * sizeof(char *))); + carry = ch_malloc((*ct + 1) * sizeof(char *)); carry[*ct] = NULL; i = 0; @@ -176,22 +175,6 @@ struct env_var *cdi_envs_get(const char *devid) return NULL; } -/* Free spec. */ -void cdi_free(struct cdi_spec *spec) -{ - free(spec->kind); - free(spec->src_path); - for (size_t i = 0; spec->envs[i].name != NULL; i++) { - free(spec->envs[i].name); - free(spec->envs[i].value); - } - free(spec->envs); - for (size_t i = 0; spec->ldconfigs[i] != NULL; i++) - free(spec->ldconfigs[i]); - free(spec->ldconfigs); - free(spec); -} - void cdi_hook_nv_ldcache(struct cdi_spec *spec, char **args) { for (size_t i = 0; args[i] != NULL; i++) @@ -208,19 +191,13 @@ void cdi_hook_nv_ldcache(struct cdi_spec *spec, char **args) /* Return a freshly allocated string describing the given hook, for logging. */ char *cdi_hook_to_string(const char *hook_name, char **args) { - char *ret, *args_str; - - args_str = strdup(""); - for (size_t i = 0; args[i] != NULL; i++) { - char *as_old = args_str; - T_ (1 <= asprintf(&args_str, "%s %s", as_old, args[i])); - free(as_old); - } + char *args_str; - T_ (1 <= asprintf(&ret, "%s:%s", hook_name, args_str)); + args_str = ""; + for (size_t i = 0; args[i] != NULL; i++) + args_str = cats(3, as_old, " ", args[i]); - free(args_str); - return ret; + return ch_asprintf("%s:%s", hook_name, args_str)); } /* Read the CDI spec files we need. @@ -246,7 +223,6 @@ void cdi_init(struct cdi_config *cf) struct cdi_spec *spec = cdi_read_maybe(cdi_specs, cf->devids[i]); if (spec != NULL) list_append((void **)&cdi_specs, spec, sizeof(*spec)); - free(spec); } // Read CDI spec files in configured directories if neccessary. 
@@ -261,12 +237,8 @@ void cdi_init(struct cdi_config *cf) struct cdi_spec *spec = cdi_read_maybe(cdi_specs, path); if (spec != NULL && cdi_requested(cf, spec)) list_append((void **)&cdi_specs, spec, sizeof(*spec)); - free(path); - free(spec); } - free(des[j]); } - free(des); } // debugging: print parsed CDI specs @@ -307,8 +279,7 @@ void cdi_log(struct cdi_spec *spec) } /* Read and parse the CDI spec file at path. Return a pointer to the parsed - struct, which the caller is responsible for freeing. If something goes - wrong, exit with error. */ + struct. If something goes wrong, exit with error. */ struct cdi_spec *cdi_read(const char *path) { FILE *fp; @@ -323,11 +294,11 @@ struct cdi_spec *cdi_read(const char *path) Tf (fp = fopen(path, "rb"), "CDI: can't open: %s", path); Zf (fstat(fileno(fp), &st), "CDI: can't stat: %s", path); for (size_t used = 0, avail = READ_SZ; true; avail += READ_SZ) { - T_ (text = realloc(text, avail)); + text = ch_realloc(text, avail); size_t read_ct = fread(text + used, 1, READ_SZ, fp); used += read_ct; if (read_ct < READ_SZ) { - if (feof(fp)) { // EOF reached + if (feof(fp)) { // EOF reached text[used] = '\0'; // ensure string ended break; } @@ -340,16 +311,14 @@ struct cdi_spec *cdi_read(const char *path) Tf(tree != NULL, "CDI: JSON failed at byte %d: %s", parse_end - text, path); // Visit parse tree to build our struct. - T_ (spec = calloc(1, sizeof(struct cdi_spec))); - T_ (spec->src_path = strdup(path)); + spec = ch_malloc(sizeof(struct cdi_spec)); + spec->src_path = path; spec->src_dev = st.st_dev; spec->src_ino = st.st_ino; visit(cdiPD_root, tree, spec); // Clean up. 
VERBOSE("CDI: spec read OK: %s: %s", spec->kind, path); - free(text); - cJSON_Delete(tree); return spec; } @@ -463,9 +432,6 @@ void cdiPC_hook(cJSON *tree, struct cdi_spec *spec) if (!hook_known) WARNING("CDI: ignoring unknown hook: %s", hook_str); - - free(hook_str); - free(args); } void cdiPC_kind(cJSON *tree, struct cdi_spec *spec) @@ -473,6 +439,27 @@ void cdiPC_kind(cJSON *tree, struct cdi_spec *spec) T_ (spec->kind = strdup(tree->valuestring)); } +/* Initialize the cJSON stuff. Quirks: + + 1. Despite using reallocation internally, cJSON indeed does not accept a + realloc(3) replacement, though it possibly used to. If malloc(3) and + free(3) are provided, then it just doesn’t call any realloc(). + + Weirdly, cJSON appears to have a notion of “internal” memory management + that uses malloc(3), realloc(3), and free(3) regardless of these hooks. + + 2. cJSON prefixes everything with CJSON_CDECL, which is juts __cdecl, which + is unnecessary for C code. Maybe this is for using cJSON in C++? */ +void json_init(void) +{ + cJSON_Hooks hooks = (cJSON_Hooks) { + .malloc_fn = ch_malloc_pointerful, + .free_fn = ch_free_noop, + }; + + cJSON_InitHooks(&hooks); +} + /* Visit each node in the parse tree in depth-first order. At each node, if there is a matching callback in actions, call it. For arrays, call the callback once per array element. */ diff --git a/bin/json.h b/bin/json.h index 2de94d1d9..9675a7a3e 100644 --- a/bin/json.h +++ b/bin/json.h @@ -29,3 +29,4 @@ struct cdi_config { struct env_var *cdi_envs_get(const char *devid); void cdi_init(struct cdi_config *cf); +void json_init(void); diff --git a/bin/mem.c b/bin/mem.c index 1e576609d..eca1689af 100644 --- a/bin/mem.c +++ b/bin/mem.c @@ -56,6 +56,7 @@ #include #include #include +#include #ifdef HAVE_GC #include @@ -134,21 +135,38 @@ long time_collecting_prev = 0; be the fastest to format. */ char *ch_asprintf(const char *fmt, ...) 
{ - va_list ap1, ap2; - int str_len; + va_list ap; char *str; - va_start(ap1, fmt); - va_copy(ap2, ap1); + va_start(ap); + str = ch_vasprintf(fmt, ap); + va_end(ap); - T_ (0 <= (str_len = vsnprintf(NULL, 0, fmt, ap1))); - str = ch_malloc(str_len + 1, false); - T_ (str_len == vsnprintf(str, str_len + 1, fmt, ap2)); + return str; +} - va_end(ap1); - va_end(ap2); +/* Fork the process. In parent, return the PID of the child; in the child, + return 0. Cannot fail. - return str; + The main purpose of this wrapper is to do an aggressive garbage collection + prior to fork(2) so the child is a small as possible. */ +pid_t ch_fork(void) +{ + pid_t child; + + ch_memory_log("fork"); + garbageinate("fkgc"); + + child = fork(); + Tf (child >= 0, "can't fork"); + + return child; +} + +/* free(3)-alike that does nothing. Don’t call it. Provided for libraries that + let us hook memory allocation and de-allocation, e.g. cJSON. */ +void ch_free_noop(void *p) +{ } /* Return a new null-terminated string containing the next record from fp, @@ -225,6 +243,22 @@ void *ch_malloc(size_t size, bool pointerful) return buf; } +/* Like ch_malloc(), but same API as malloc(3). Prefer ch_malloc(). This is + provided for libraries that let us hook memory allocation and + de-allocation, e.g. cJSON. */ +void *ch_malloc(size_t size) +{ + return ch_malloc(size, true); +} + +/* Like ch_malloc(), but buffer contents are zeroed. */ +void ch_malloc_zeroed(size_t size, bool pointerful) +{ + void *buf = ch_malloc(size, pointerful); + memset(buf, 0, size); + return buf; +} + /* Shut down memory management. */ void ch_memory_exit(void) { @@ -367,9 +401,54 @@ void *ch_realloc(void *p, size_t size, bool pointerful) return p_new; } +/* Return a copy of s in a newly allocated, pointerless buffer. Cannot fail. + + Note: Unlike strdup(3), ch_strdup() is only needed if you need to actually + modify the copy. It should not be used to simplify memory management. 
*/ +char *ch_strdup(const char *s) +{ + char *dst; + +#ifdef HAVE_GC + dst = GC_STRDUP(s); +#else + dst = strdup(s); +#endif + + T_ (dst); + return dst; +} + +/* Like ch_asprintf(), but takes and consumes a va_list pointer. */ +char *ch_vasprintf(const char *fmt, va_list ap) +{ + va_list ap2; + int str_len; + char *str; + + va_copy(ap2, ap); + + T_ (0 <= (str_len = vsnprintf(NULL, 0, fmt, ap))); + str = ch_malloc(str_len + 1, false); + T_ (str_len == vsnprintf(str, str_len + 1, fmt, ap2)); + + va_end(ap2); + + return str; +} + +/* If linked with libgc, do a maximum-effort garbage collection; otherwise, do + nothing. Use when to tag memory logging. */ +void garbageinate(const char *when) +{ +#ifdef HAVE_GC + GC_collect_and_unmap(); + ch_memory_log(when); +#endif +} + /* Convert a signed number of bytes to kilobytes (truncated) and return it. */ ssize_t kB(ssize_t byte_ct) { return byte_ct / 1024; } - diff --git a/bin/mem.h b/bin/mem.h index a6dcd66b2..63279691d 100644 --- a/bin/mem.h +++ b/bin/mem.h @@ -5,15 +5,21 @@ #include #include +#include /** Function prototypes **/ char *ch_asprintf(const char *fmt, ...); +pid_t ch_fork(void); +void ch_free_noop(void *p); char *ch_getdelim(FILE *fp, char delim); void ch_memory_exit(void); void ch_memory_init(void); void ch_memory_log(const char *when); void *ch_malloc(size_t size, bool pointerful); +void *ch_malloc_pointerful(size_t size); +void *ch_malloc_zeroed(size_t size, bool pointerful); void *ch_realloc(void *p, size_t size, bool pointerful); char *ch_strdup(const char *src); -void garbageinate(void); +char *ch_vasprintf(const char *fmt, va_list ap); +void garbageinate(const char *when); diff --git a/bin/misc.c b/bin/misc.c index cf5317bd4..31ab63bae 100644 --- a/bin/misc.c +++ b/bin/misc.c @@ -1,6 +1,8 @@ /* Copyright © Triad National Security, LLC, and others. 
*/ #define _GNU_SOURCE +#include "config.h" + #include #include #include @@ -16,7 +18,6 @@ #include #include -#include "config.h" #include "misc.h" @@ -64,7 +65,8 @@ static const char **LL_COLOURS = _LL_COLOURS + 3; /** External variables **/ -/* If true, exit abnormally on fatal error. Set in ch-run.c. */ +/* If true, exit abnormally on fatal error. Set in ch-run.c during argument + parsing, so will always be default value before that. */ bool abort_fatal = false; /* If true, use colored logging. Set in ch-run.c. */ @@ -115,7 +117,8 @@ char *argv_to_string(char **argv) bool quote_p = false; // Max length is escape every char plus two quotes and terminating zero. - T_ (argv_ = calloc(2 * strlen(argv[i]) + 3, 1)); + // Initialize to zeroes so we don’t have to terminate string later. + argv_ = ch_malloc_zeroed(2 * strlen(argv[i]) + 3, false); // Copy to new string, escaping as we go. Note lots of fall-through. I'm // not sure where this list of shell meta-characters came from; I just @@ -158,22 +161,8 @@ char *argv_to_string(char **argv) } } - if (quote_p) { - x = argv_; - T_ (1 <= asprintf(&argv_, "\"%s\"", argv_)); - free(x); - } - - if (i != 0) { - x = s; - s = cat(s, " "); - free(x); - } - - x = s; - s = cat(s, argv_); - free(x); - free(argv_); + s = cats(5, s, i == 0 ? "" : " ", + quote_p ? "\"" : "", argv_, quote_p ? "\""); } return s; @@ -197,9 +186,9 @@ const char *bool_to_string(bool b) 3. The buffer contains no empty strings. These assumptions are consistent with the construction of the “warnings” - shared memory buffer, which is the main justification for this function. Note - that under these assumptions, the final byte in the buffer is guaranteed to - be null. */ + shared memory buffer, which is the main justification for this function. + Note that under these assumptions, the final byte in the buffer is + guaranteed to be null. 
*/ int buf_strings_count(char *buf, size_t size) { int count = 0; @@ -225,50 +214,102 @@ bool buf_zero_p(void *buf, size_t size) return true; } -/* Concatenate strings a and b into a newly-allocated buffer and return the a +/* Concatenate strings a and b into a newly-allocated buffer and return a pointer to this buffer. */ char *cat(const char *a, const char *b) { - char *ret; - if (a == NULL) - a = ""; - if (b == NULL) - b = ""; - T_ (asprintf(&ret, "%s%s", a, b) == strlen(a) + strlen(b)); - return ret; + return cats(2, a, b); } -/* Like scandir(3), but (1) filter excludes “.” and “..”, (2) results are not - sorted, and (3) cannot fail (exits with an error instead). */ -int dir_ls(const char *path, struct dirent ***namelist) +/* Concatenate argc strings into a newly allocated buffer and return a pointer + to this buffer. If argc is zero, return the empty string. NULL pointers are + treated as empty strings. */ +char *cats(size_t argc, ...) { - int entry_ct; + char *ret, *next; + size_t ret_len; + char **argv; + size_t *argv_lens; + va_list ap; - entry_ct = scandir(path, namelist, dir_ls_filter, NULL); - Tf (entry_ct >= 0, "can't scan dir: %s", path); - return entry_ct; + argv = ch_malloc(argc * sizeof(char *), true); + argv_lens = ch_malloc(argc * sizeof(size_t), false); + + // compute buffer size and convert NULLs to empty string + va_start(ap, argc); + ret_len = 1; // for terminator + for (int i = 0; i < argc; i++) + { + char *arg = va_arg(ap); + if (arg == NULL) { + argv[i] = ""; + argv_lens[i] = 0; + } else { + argv[i] = arg; + argv_lens[i] = strlen(arg); + } + ret_len += argv_lens[i]; + } + va_end(ap); + + // copy strings + ret = ch_malloc(ret_len, false); + next = ret; + for (int i = 0; i < argc; i++) { + memcpy(next, argv[i], argv_lens[i]); + next += argv_lens[i]; + } + ret[ret_len] = '\0'; + + return ret; } -/* Return the number of entries in directory path, not including “.” and “..”; - i.e., the empty directory returns 0 despite them. 
*/ -int dir_ls_count(const char *path) -{ - int ct; - struct dirent **namelist; +/* Return a newly-allocated, null-terminated list of filenames in directory + path that match fnmatch(3)-pattern glob, excluding “.” and “..”. For a list + of everything, pass "*" for glob. Leading dots *do* match “*”. - ct = dir_ls(path, &namelist); - for (size_t i = 0; i < ct; i++) - free(namelist[i]); - free(namelist); + We use readdir(3) rather than scandir(3) because the latter allocates + memory with malloc(3). */ +char **dir_glob(const char *path, const char *glob) +{ + DIR *dp; + int i; // index of next free array element + size_t alloc_ct = 16; + char **entries = ch_malloc(alloc_ct * sizeof(char *), true); + + Tf (dp = opendir(path), "can't open directory: %s", path); + i = 0; + while (true) { + struct dirent *entry; + int matchp; + errno = 0; + entry = readdir(dp); + if (entry == NULL) { + Zf (errno, "can’t read directory: %s", path); + break; // EOF + } + matchp = fnmatch(glob, entry->d_name, FNM_EXTMATCH); + if (matchp != 0) { + T_ (matchp == FNM_NOMATCH); // error? + continue; // no match, skip + } + if (i >= alloc_ct - 1) { + alloc_ct *= 2; + entries = ch_realloc(allot_ct * sizeof(char *), true); + } + entries[i] = entry->d_name; + i++; + } + entries[i] = NULL; + Zf (closedir(dp), "can't close directory: %s", path); - return ct; + return entries; } -/* scandir(3) filter that excludes “.” and “..”: Return 0 if e->d_name is one - of those strings, else 1. */ -int dir_ls_filter(const struct dirent *e) +/* Return the number of matches for glob in path. 
*/ +int dir_glob_count(const char *path, const char *glob) { - return !(!strcmp(e->d_name, ".") || !strcmp(e->d_name, "..")); + return list_count((void **)dir_glob(path, glob), sizeof(char *)); } /* Read the file listing environment variables at path, with records separated @@ -293,18 +334,14 @@ struct env_var *env_file_read(const char *path, int delim) vars = list_new(sizeof(struct env_var), 0); for (size_t line_no = 1; true; line_no++) { struct env_var var; - char *line = NULL; - size_t line_len = 0; // don't care but required by getline(3) + char *line; errno = 0; - if (-1 == getdelim(&line, &line_len, delim, fp)) { - if (errno == 0) // EOF - break; - else - Tf (0, "can't read: %s", path); - } - if (line[strlen(line) - 1] == '\n') // rm newline if present + line = ch_getdelim(fp, delim, fp); + if (line == NULL) // EOF + break; + if (line[strlen(line) - 1] == (char)delim) // rm delimiter if present line[strlen(line) - 1] = 0; - if (line[0] == 0) // skip blank lines + if (line[0] == '\0') // skip blank lines continue; var = env_var_parse(line, path, line_no); list_append((void **)&vars, &var, sizeof(var)); @@ -315,11 +352,7 @@ struct env_var *env_file_read(const char *path, int delim) } /* Return the value of environment variable name if set; otherwise, return - value_default instead. - - Note the implications for memory management: you may get a pointer into - environ (?), which you do not own and must not free, or value_default, - which may or may not need to be freed. */ + value_default instead. */ char *env_get(const char *name, char *value_default) { char *ret = getenv(name); @@ -327,7 +360,6 @@ char *env_get(const char *name, char *value_default) } - /* Set environment variable name to value. If expand, then further expand variables in value marked with "$" as described in the man page. 
*/ void env_set(const char *name, const char *value, const bool expand) @@ -339,10 +371,10 @@ void env_set(const char *name, const char *value, const bool expand) char *vwk_cur; // current location in vwk char *vout = NULL; // output (expanded) string bool first_out = false; // true after 1st output element written - T_ (vwk = strdup(value)); + vwk = ch_strdup(value); vwk_cur = vwk; while (true) { // loop executes ≥ once - char *elem = strsep(&vwk_cur, ":"); // NULL -> no more elements + char *elem = strsep(&vwk_cur, ":"); // NULL -> no more elements if (elem == NULL) break; if (elem[0] == '$' && elem[1] != 0) { // looks like $VARIABLE @@ -351,11 +383,8 @@ void env_set(const char *name, const char *value, const bool expand) elem = NULL; // convert to unset } if (elem != NULL) { // empty -> omit from output list - char *vout_old = vout; - T_ (1 <= asprintf(&vout, "%s%s%s", vout_old ? vout_old : "", - !first_out ? ":" : "", elem)); + vout = cats(3, vout, first_out ? "" : ":", elem); first_out = true; - free(vout_old); } } value = vwk; @@ -364,18 +393,6 @@ void env_set(const char *name, const char *value, const bool expand) // Save results. DEBUG("environment: %s=%s", name, value); Z_ (setenv(name, value, 1)); - free(vwk); -} - -/* Free the environment variabls list *vars, both the individual buffers within - as well as the whole list, then set *vars to NULL. */ -void envs_free(struct env_var **vars) -{ - for (int i = 0; (*vars)[i].name != NULL; i++) - free((*vars)[i].name); // .value points into same buffer; see split() - - free(*vars); - *vars = NULL; } void envs_set(const struct env_var *vars, const bool expand) @@ -391,8 +408,8 @@ void envs_set(const struct env_var *vars, const bool expand) O(n^2) search until no matches remain. Our approach is O(n): we build up a copy of environ, skipping variables - that match the glob, and then assign environ to the copy. (This is a valid - thing to do [2].) + that match the glob, and then assign environ to the copy. 
This is a valid + thing to do [2]. [1]: https://unix.stackexchange.com/a/302987 [2]: http://man7.org/linux/man-pages/man3/exec.3p.html */ @@ -404,7 +421,7 @@ void envs_unset(const char *glob) int matchp; split(&name, &value, environ[i], '='); T_ (name != NULL); // environ entries must always have equals - matchp = fnmatch(glob, name, FNM_EXTMATCH); // extglobs if available + matchp = fnmatch(glob, name, FNM_EXTMATCH); // extglobs if available if (matchp == 0) { DEBUG("environment: unset %s", name); } else { @@ -424,17 +441,15 @@ struct env_var env_var_parse(const char *line, const char *path, size_t lineno) { char *name, *value, *where; - if (path == NULL) { - T_ (where = strdup(line)); - } else { - T_ (1 <= asprintf(&where, "%s:%zu", path, lineno)); - } + if (path == NULL) + where = ch_strdup(line); + else + where = ch_asprintf("%s:%zu", path, lineno)); // Split line into variable name and value. split(&name, &value, line, '='); Te (name != NULL, "can't parse variable: no delimiter: %s", where); Te (name[0] != 0, "can't parse variable: empty name: %s", where); - free(where); // for Tim // Strip leading and trailing single quotes from value, if both present. if ( strlen(value) >= 2 @@ -453,17 +468,21 @@ struct env_var env_var_parse(const char *line, const char *path, size_t lineno) list to the new location. *list can be NULL to initialize a new list. Return the new array size. - Note: ar must be cast, e.g. "list_append((void **)&foo, ...)". + Usage note: ar must be cast, e.g. "list_append((void **)&foo, ...)". + + Implementation note: We could round up the new size to the next power of + two for allocation purposes, which would reduce the number of realloc() + that actually change the size. However, many allocators do this type of + thing internally already, and that seems a better place for it. Warning: This function relies on all pointers having the same representation, which is true on most modern machines but is not guaranteed by the standard [1]. 
We could instead return the new value of ar rather than using an out parameter, which would avoid the double pointer and associated non-portability but make it easy for callers to create dangling - pointers, i.e., after "a = list_append(b, ...)", b will dangle. That - problem could in turn be avoided by returning a *copy* of the array rather - than a modified array, but then the caller has to deal with the original - array itself. It seemed to me the present behavior was the best trade-off. + pointers, i.e., after “a = list_append(b, ...)”, b will be invalid. This + isn’t just about memory leaks but also the fact that b points to an invalid + buffer that likely *looks* valid. [1]: http://www.c-faq.com/ptrs/genericpp.html */ void list_append(void **ar, void *new, size_t size) @@ -472,7 +491,7 @@ void list_append(void **ar, void *new, size_t size) T_ (new != NULL); ct = list_count(*ar, size); - T_ (*ar = realloc(*ar, (ct+2)*size)); // existing + new + terminator + *ar = ch_realloc(*ar, (ct+2)*size, true)); // existing + new + terminator memcpy(*ar + ct*size, new, size); // append new (no overlap) memset(*ar + (ct+1)*size, 0, size); // set new terminator } @@ -485,12 +504,13 @@ void list_cat(void **dst, void *src, size_t size) ct_dst = list_count(*dst, size); ct_src = list_count(src, size); - T_ (*dst = realloc(*dst, (ct_dst+ct_src+1)*size)); + *dst = ch_realloc(*dst, (ct_dst+ct_src+1)*size, true); memcpy(*dst + ct_dst*size, src, ct_src*size); // append src (no overlap) memset(*dst + (ct_dst+ct_src)*size, 0, size); // set new terminator } -/* Return the number of elements of size size in list *ar. */ +/* Return the number of elements of size size in list *ar, not including the + terminating zero element. */ size_t list_count(void *ar, size_t size) { size_t ct; @@ -503,29 +523,17 @@ size_t list_count(void *ar, size_t size) return ct; } -/* *ar is a list of pointers to malloc()’ed buffers (which is why object size - is not provided). 
Free those buffers, then free *ar itself and set it to - NULL. */ -void list_free_shallow(void ***ar) -{ - T_ (*ar != NULL); - for (int i; (*ar)[i] != NULL; i++) - free((*ar)[i]); - free(*ar); - *ar = NULL; -} - /* Return a pointer to a new, empty zero-terminated array containing elements of size size, with room for ct elements without re-allocation. The latter allows to pre-allocate an arbitrary number of slots in the list, which can - then be filled directly without testing the list's length for each one. + then be filled directly without testing the list’s length for each one. (The list is completely filled with zeros, so every position has a terminator after it.) */ void *list_new(size_t size, size_t ct) { void *list; T_ (size > 0); - T_ (list = calloc(ct+1, size)); + T_ (list = ch_malloc_zeroed(ct+1, size, true)); return list; } @@ -533,23 +541,11 @@ void *list_new(size_t size, size_t ct) treated as one). Copy each token into a newly-allocated string buffer, and return these strings as a new list. - Notes: - - 1. The interface deliberately accepts a single delimiter, not multiple - like strtok(3). - - 2. This approach has a redundant malloc(3) for each token, because we - have to copy the input string into a new buffer anyway to satisfy - strtok_r(3). We could use the multiple token pointers into this single - buffer as the list elements. However, this would yield a - difficult-to-free list: one would have to free only the *first* - element in the list and no others. Also, if any other strings are - later added to the list, those would need to be freed differently. - This all seemed extremely bug-prone. */ + The function accepts a single delimiter, not multiple like strtok(3). 
*/ void *list_new_strings(char delim, const char *str) { char **list; - char *str_copy, *str_init, *tok_state; + char *str_, *tok_state; char delims[] = { delim, '\0' }; size_t delim_ct = 0; @@ -558,58 +554,26 @@ void *list_new_strings(char delim, const char *str) // adjacent delimiters and thus may overcount tokens, possibly wasting a // small amount of memory. for (int i = 0; str[i] != '\0'; i++) - delim_ct += str[i] == delim ? 1 : 0; + delim_ct += (str[i] == delim ? 1 : 0); list = list_new(delim_ct + 1, sizeof(char *)); // Note: strtok_r(3)’s interface is rather awkward; see its man page. - T_ (str_copy = strdup(str)); - str_init = str_copy; + str_ = ch_strdup(str); // so we can modify it tok_state = NULL; for (int i = 0; true; i++) { char *tok; - tok = strtok_r(str_init, delims, &tok_state); + tok = strtok_r(str_, delims, &tok_state); if (tok == NULL) break; T_ (i < delim_ct + 1); // bounds check - T_ (tok = strdup(tok)); // copy tok into buffer we own list[i] = tok; - str_init = NULL; + str_ = NULL; // only pass actual string on first call } - free(str_copy); return list; } -/* Remove any duplicate elements in ar, in-place, according to comparison - function cmp. The last duplicate in the list wins. Preserves order - otherwise. */ -void list_uniq(void *ar, size_t size, comparison_fn_t cmp) -{ - size_t rm_ct; - size_t ct_starting = list_count(ar, size); - void *zero_blk = ar + ct_starting * size; // assumes terminated correctly - - // Loop backwards through array; set duplicates to zero. We could instead - // bubble out the duplicates here, but I felt keeping track of indices - // would be too hard. 
- for (int i = ct_starting - 1; i > 0; i--) { // ar[0] has nothing prior - if (memcmp(ar + i * size, zero_blk, size)) // if not already deleted - for (int j = i - 1; j >= 0; j--) - if (!cmp(ar + i * size, ar + j * size)) - memset(ar + j * size, 0, size); - } - // Loop forwards through array, shifting each item backwards the number of - // zero blocks we’ve seen so far. - rm_ct = 0; - for (int i = 0; i < ct_starting; i++) - if (!memcmp(ar + i * size, zero_blk, size)) // ar[i] deleted - rm_ct++; - else if (rm_ct > 0) - memcpy(ar + (i - rm_ct) * size, ar + i * size, size); - memset(ar + (ct_starting - rm_ct) * size, 0, size); // terminate -} - /* If verbose enough, print uids and gids on stderr prefixed with where. FIXME: Should change to DEBUG(), but that will give the file/line within @@ -645,9 +609,8 @@ void log_ids(const char *func, int line) } } - -/* Set up logging. Note ch-run(1) specifies a bunch of - color synonyms; this translation happens during argument parsing.*/ +/* Set up logging. 
Note ch-run(1) specifies a bunch of color synonyms; this + translation happens during argument parsing.*/ void logging_init(enum log_color_when when, enum log_test test) { // set up colors @@ -667,7 +630,7 @@ void logging_init(enum log_color_when when, enum log_test test) log_color_p = false; break; case LL_COLOR_NULL: - Tf(0, "unreachable code reached"); + T_ (0); // unreachable break; } @@ -700,9 +663,9 @@ void mkdir_overmount(const char *path, const char *scratch) struct dirent **entries; VERBOSE("making writeable via symlink ranch: %s", path); - path2 = strdup(path); + path2 = ch_strdup(path); parent = dirname(path2); - T_ (1 <= asprintf(&over, "%s/%d", scratch, dir_ls_count(scratch) + 1)); + over = ch_asprintf("%s/%d", scratch, dir_ls_count(scratch) + 1); path_dst = path_join(over, orig_dir); // bind-mounts @@ -714,25 +677,15 @@ void mkdir_overmount(const char *path, const char *scratch) "can't bind-mount: %s- > %s", over, parent); // symlink ranch - entry_ct = dir_ls(path_dst, &entries); + entry_ct = dir_glob_count(path_dst, "*"); DEBUG("existing entries: %d", entry_ct); for (int i = 0; i < entry_ct; i++) { char * src = path_join(parent, entries[i]->d_name); char * dst = path_join(orig_dir, entries[i]->d_name); - Zf (symlink(dst, src), "can't symlink: %s -> %s", src, dst); - - free(src); - free(dst); - free(entries[i]); } - free(entries); Zf (mkdir(path, 0755), "can't mkdir even after overmount: %s", path); - - free(path_dst); - free(over); - free(path2); } /* Create directories in path under base. 
Exit with an error if anything goes @@ -761,17 +714,16 @@ void mkdirs(const char *base, const char *path, char **denylist, TRACE("mkdirs: base: %s", basec); TRACE("mkdirs: path: %s", path); - for (size_t i = 0; denylist[i] != NULL; i++) + for (int i = 0; denylist[i] != NULL; i++) TRACE("mkdirs: deny: %s", denylist[i]); - pathw = cat(path, ""); // writeable copy - saveptr = NULL; // avoid warning (#1048; see also strtok_r(3)) + pathw = ch_strdup(path); // writeable copy + saveptr = NULL; // avoid warning (#1048; see also strtok_r(3)) component = strtok_r(pathw, "/", &saveptr); nextc = basec; next = NULL; while (component != NULL) { - next = cat(nextc, "/"); - next = cat(next, component); // canonical except for last component + next = path_join(nextc, component); // canonical except for last TRACE("mkdirs: next: %s", next) component = strtok_r(NULL, "/", &saveptr); // next NULL if current last if (path_exists(next, &sb, false)) { @@ -818,7 +770,7 @@ void msg(enum log_level level, const char *file, int line, int errno_, } noreturn void msg_fatal(const char *file, int line, int errno_, - const char *fmt, ...) + const char *fmt, ...) { va_list ap; @@ -837,7 +789,6 @@ void msgv(enum log_level level, const char *file, int line, int errno_, const char *fmt, va_list ap) { // note: all components contain appropriate leading/trailing space - // note: be careful about which components need to be freed char *text_formatted; // caller’s message, formatted char *level_prefix; // level prefix char *errno_code; // errno code/number @@ -846,14 +797,14 @@ void msgv(enum log_level level, const char *file, int line, int errno_, const char * colour; // ANSI codes for color const char * colour_reset; // ANSI codes to reset color - if (level > verbose) // not verbose enough to log message; do nothing + if (level > verbose) // not verbose enough; do nothing return; // Format caller message. 
if (fmt == NULL) text_formatted = "please report this bug"; // users should not see else - T_ (1 <= vasprintf(&text_formatted, fmt, ap)); + text_formatted = ch_vasprintf(fmt, ap); // Prefix some of the levels. switch (level) { @@ -874,7 +825,7 @@ void msgv(enum log_level level, const char *file, int line, int errno_, errno_desc = ""; } else { errno_code = cat(" ", strerrorname_np(errno_)); // FIXME: non-portable - T_ (1 <= asprintf(&errno_desc, ": %s", strerror(errno_))); + errno_desc = ch_asprintf(": %s", strerror(errno_)); } // Color. @@ -887,25 +838,16 @@ void msgv(enum log_level level, const char *file, int line, int errno_, }; // Format and print. - T_ (1 <= asprintf(&text_full, "%s[%d]: %s%s%s (%s:%d%s)", - program_invocation_short_name, getpid(), - level_prefix, text_formatted, errno_desc, - file, line, errno_code)); + text_full = ch_asprintf("%s[%d]: %s%s%s (%s:%d%s)", + program_invocation_short_name, getpid(), + level_prefix, text_formatted, errno_desc, + file, line, errno_code)); fprintf(stderr, "%s%s%s\n", colour, text_full, colour_reset); if (fflush(stderr)) - abort(); // can't print an error b/c already trying to do that + abort(); // can’t print an error b/c already trying to do that if (level == LL_WARNING) warnings_offset += string_append(warnings, text_full, WARNINGS_SIZE, warnings_offset); - - // Clean up. - free(text_full); - if (errno_) { - free(errno_code); - free(errno_desc); - } - if (fmt != NULL) - free(text_formatted); } /* Return true if the given path exists, false otherwise. On error, exit. If @@ -934,27 +876,23 @@ bool path_exists(const char *path, struct stat *statbuf, bool follow_symlink) /* Concatenate paths a and b, then return the result. 
*/ char *path_join(const char *a, const char *b) { - char *ret; - T_ (a != NULL); T_ (strlen(a) > 0); T_ (b != NULL); T_ (strlen(b) > 0); - T_ (asprintf(&ret, "%s/%s", a, b) == strlen(a) + strlen(b) + 1); - - return ret; + return ch_asprintf("%s/%s", a, b); } /* Return the mount flags of the file system containing path, suitable for passing to mount(2). - This is messy because, the flags we get from statvfs(3) are ST_* while the + This is messy because the flags we get from statvfs(3) are ST_* while the flags needed by mount(2) are MS_*. My glibc has a comment in bits/statvfs.h - that the ST_* "should be kept in sync with" the MS_* flags, and the values + that the ST_* “should be kept in sync with” the MS_* flags, and the values do seem to match, but there are additional undocumented flags in there. - Also, the kernel contains a test "unprivileged-remount-test.c" that - manually translates the flags. Thus, I wasn't comfortable simply passing + Also, the kernel contains a test “unprivileged-remount-test.c” that + manually translates the flags. Thus, I wasn’t comfortable simply passing the output of statvfs(3) to mount(2). */ unsigned long path_mount_flags(const char *path) { @@ -992,18 +930,10 @@ unsigned long path_mount_flags(const char *path) that output. */ void path_split(const char *path, char **dir, char **base) { - char *path2; - - if (dir != NULL) { - T_ (path2 = strdup(path)); - T_ (*dir = strdup(dirname(path2))); - free(path2); - } + if (dir != NULL) + *dir = dirname(ch_strdup(path)); if (base != NULL) { - T_ (path2 = strdup(path)); - T_ (*base = strdup(basename(path2))); - free(path2); - } + *base = basename(ch_strdup(path)); } /* Return true if path is a subdirectory of base, false otherwise. 
Acts on the @@ -1048,7 +978,7 @@ char *realpath_(const char *path, bool fail_ok) if (pathc == NULL) { if (fail_ok) { - T_ (pathc = strdup(path)); + pathc = ch_strdup(path)); } else { Tf (false, "can't canonicalize: %s", path); } @@ -1068,32 +998,23 @@ void replace_char(char *s, char old, char new) /* Split string str at first instance of delimiter del. Set *a to the part before del, and *b to the part after. Both can be empty; if no token is present, set both to NULL. Unlike strsep(3), str is unchanged; *a and *b - point into a new buffer allocated with malloc(3). This has two - implications: (1) the caller must free(3) *a but not *b, and (2) the parts - can be rejoined by setting *(*b-1) to del. The point here is to provide an - easier wrapper for strsep(3). */ + point into a new buffer. Therefore, the parts can be rejoined by setting + *(*b-1) to del. The point here is to provide an easier wrapper for + strsep(3). */ void split(char **a, char **b, const char *str, char del) { - char *tmp; char delstr[2] = { del, 0 }; T_ (str != NULL); - tmp = strdup(str); - *b = tmp; + *b = ch_strdup(str); *a = strsep(b, delstr); if (*b == NULL) *a = NULL; } -/* Report the version number. */ -void version(void) -{ - fprintf(stderr, "%s\n", VERSION); -} - -/* Append null-terminated string “str” to the memory buffer “offset” bytes after - from the address pointed to by “addr”. Buffer length is “size” bytes. Return - the number of bytes written. If there isn’t enough room for the string, do - nothing and return zero. */ +/* Append null-terminated string “str” to the memory buffer “offset” bytes + after from the address pointed to by “addr”. Buffer length is “size” bytes. + Return the number of bytes written. If there isn’t enough room for the + string, do nothing and return zero. 
*/ size_t string_append(char *addr, char *str, size_t size, size_t offset) { size_t written = strlen(str) + 1; @@ -1104,6 +1025,12 @@ size_t string_append(char *addr, char *str, size_t size, size_t offset) return written; } +/* Report the version number. */ +void version(void) +{ + fprintf(stderr, "%s\n", VERSION); +} + /* Reprint messages stored in “warnings” memory buffer. */ void warnings_reprint(void) { diff --git a/bin/misc.h b/bin/misc.h index 6d9dce82b..b6288f0ef 100644 --- a/bin/misc.h +++ b/bin/misc.h @@ -129,23 +129,20 @@ const char *bool_to_string(bool b); int buf_strings_count(char *str, size_t s); bool buf_zero_p(void *buf, size_t size); char *cat(const char *a, const char *b); -int dir_ls(const char *path, struct dirent ***namelist); -int dir_ls_count(const char *path); -int dir_ls_filter(const struct dirent *e); +char *cats(size_t argc, ...); +char **dir_glob(const char *path, const char *glob); +int dir_glob_count(const char *path, const char *glob); struct env_var *env_file_read(const char *path, int delim); char *env_get(const char *name, char *value_default); void env_set(const char *name, const char *value, const bool expand); -void envs_free(struct env_var **vars); void envs_set(const struct env_var *envs, const bool expand); void envs_unset(const char *glob); struct env_var env_var_parse(const char *line, const char *path, size_t lineno); void list_append(void **ar, void *new, size_t size); void list_cat(void **dst, void *src, size_t size); size_t list_count(void *ar, size_t size); -void list_free_shallow(void ***ar); void *list_new_strings(char delim, const char *s); void *list_new(size_t size, size_t ct); -void list_uniq(void *ar, size_t size, comparison_fn_t cmp); void log_ids(const char *func, int line); void logging_init(enum log_color_when when, enum log_test test); void test_logging(bool fail); diff --git a/bin/seccomp.c b/bin/seccomp.c index e027f957a..2d0fa9057 100644 --- a/bin/seccomp.c +++ b/bin/seccomp.c @@ -3,6 +3,8 @@ This 
interface contains the seccomp filter for root emulation. */ #define _GNU_SOURCE +#include "config.h" + #include #include #include @@ -15,6 +17,7 @@ #include "core.h" #include "hook.h" +#include "mem.h" /** Macros **/ @@ -151,7 +154,7 @@ void hook_seccomp_install(struct container *c, void *d) + ct_mknod // mknod(2) handling + ct_mknodat); // mknodat(2) handling DEBUG("seccomp: filter program has %d instructions", p.len); - T_ (p.filter = calloc(p.len, sizeof(struct sock_filter))); + p.filter = ch_malloc(p.len * sizeof(struct sock_filter), false); // Return call addresses. Allow needs to come first because we’ll jump to // it for unknown architectures. From bc811d1f6b2cc10412dd96b8062b5d37c740ed3e Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky Date: Mon, 30 Sep 2024 17:50:24 -0600 Subject: [PATCH 28/29] make it build and run [skip ci] --- bin/Makefile.am | 6 +-- bin/ch-run.c | 8 ++-- bin/core.c | 7 ++-- bin/fuse.c | 7 ++-- bin/hook.c | 2 +- bin/json.c | 44 ++++++++++---------- bin/mem.c | 72 ++++++++++++++++++--------------- bin/misc.c | 42 +++++++++---------- bin/misc.h | 12 +++--- bin/seccomp.c | 2 +- configure.ac | 104 ++++++++++++++++++++++-------------------------- doc/dev.rst | 22 ++++++---- 12 files changed, 165 insertions(+), 163 deletions(-) diff --git a/bin/Makefile.am b/bin/Makefile.am index c21766217..d96ef4502 100644 --- a/bin/Makefile.am +++ b/bin/Makefile.am @@ -6,7 +6,7 @@ bin_PROGRAMS = ch-checkns ch-run -ch_checkns_SOURCES = ch-checkns.c misc.h misc.c +ch_checkns_SOURCES = ch-checkns.c mem.h mem.c misc.h misc.c ch_run_SOURCES = ch-run.c core.h core.c hook.h hook.c mem.h mem.c misc.h misc.c if HAVE_JSON @@ -19,10 +19,6 @@ if HAVE_SECCOMP ch_run_SOURCES += seccomp.h seccomp.c endif -# additional build flags for ch-run -ch_run_CFLAGS = $(PTHREAD_CFLAGS) -ch_run_LDADD = $(CH_RUN_LIBS) - ## Shell scripts - distributed as-is diff --git a/bin/ch-run.c b/bin/ch-run.c index 332180d15..f5c27a2ed 100644 --- a/bin/ch-run.c +++ b/bin/ch-run.c @@ -408,7 +408,7 
@@ void hooks_env_install(struct args *args) struct env_file *ef; name = "env-set-gfile"; f = hook_envs_set_file; - ef = ch_malloc(sizeof(struct env_file)); + ef = ch_malloc(sizeof(struct env_file), true); ef->path = arg; ef->delim = delim; ef->expand = args->c.env_expand; @@ -638,7 +638,6 @@ static error_t parse_opt(int key, char *arg, struct argp_state *state) } break; case -19: // --cdi-dirs Te (strlen(arg) > 0, "--cdi-dirs: PATHS must be non-empty"); - list_free_shallow((void ***)&args->cdi.spec_dirs); args->cdi.spec_dirs = list_new_strings(':', arg); break; #endif @@ -662,7 +661,8 @@ static error_t parse_opt(int key, char *arg, struct argp_state *state) case 'b': { // --bind char *src, *dst; i = list_count(args->c.binds, sizeof(args->c.binds[0])); - args->c.binds = ch_realloc(args->c.binds, (i+2) * sizeof(struct bind)); + args->c.binds = ch_realloc(args->c.binds, (i+2) * sizeof(struct bind), + true); memset(&args->c.binds[i+1], 0, sizeof(args->c.binds[0])); // terminate args->c.binds[i].dep = BD_MAKE_DST; // source @@ -792,7 +792,7 @@ char *storage_default(void) char *storage = getenv("CH_IMAGE_STORAGE"); if (storage == NULL) - storage = ch_asprintf("/var/tmp/%s.ch", username)); + storage = ch_asprintf("/var/tmp/%s.ch", username); return storage; } diff --git a/bin/core.c b/bin/core.c index 29f383b2c..41c01fe50 100644 --- a/bin/core.c +++ b/bin/core.c @@ -289,7 +289,6 @@ enum img_type image_type(const char *ref, const char *storage_dir) char *img_name2path(const char *name, const char *storage_dir) { - char *path; char *name_fs = ch_strdup(name); replace_char(name_fs, '/', '%'); @@ -377,7 +376,7 @@ void mounts_setup(struct container *c) // Claim new root for this namespace. Despite MS_REC in bind_mount(), we do // need both calls to avoid pivot_root(2) failing with EBUSY later. 
- DEBUG("claiming new root for this namespace") + DEBUG("claiming new root for this namespace"); bind_mount(c->newroot, c->newroot, BD_REQUIRED, "/", MS_PRIVATE, NULL); bind_mount(nr_parent, nr_parent, BD_REQUIRED, "/", MS_PRIVATE, NULL); // Re-mount new root read-only unless --write or already read-only. @@ -405,7 +404,7 @@ void mounts_setup(struct container *c) Z_ (mkdir(mkdir_scratch, 0700)); options = ch_asprintf(("lowerdir=%s,upperdir=%s,workdir=%s," "index=on,userxattr,volatile"), - c->newroot, WF_MNT "/upper", WF_MNT "/work")); + c->newroot, WF_MNT "/upper", WF_MNT "/work"); // update newroot Zf (stat(c->newroot, &st), "can't stat new root; overmounted by tmpfs for -W?: %s", c->newroot); @@ -442,7 +441,7 @@ void namespace_join(pid_t pid, const char *ns) char *path; int fd; - path = ch_asprintf(&path, "/proc/%d/ns/%s", pid, ns); + path = ch_asprintf("/proc/%d/ns/%s", pid, ns); fd = open(path, O_RDONLY); if (fd == -1) { if (errno == ENOENT) { diff --git a/bin/fuse.c b/bin/fuse.c index 0d0186379..a6b0bc1da 100644 --- a/bin/fuse.c +++ b/bin/fuse.c @@ -38,6 +38,7 @@ #include "config.h" // here to avoid potential clash with SquashFUSE config.h #include "core.h" #include "fuse.h" +#include "mem.h" #include "misc.h" @@ -121,7 +122,7 @@ void sq_fork(struct container *c) // Default mount point? 
if (c->newroot == NULL) { - char *subdir = asprintf("/%s.ch/mnt", username); + char *subdir = ch_asprintf("/%s.ch/mnt", username); c->newroot = cat("/var/tmp", subdir); VERBOSE("using default mount point: %s", c->newroot); mkdirs("/var/tmp", subdir, NULL, NULL); @@ -202,7 +203,7 @@ int sq_loop(void) // [1]: https://codereview.stackexchange.com/a/109349 // [2]: https://man7.org/linux/man-pages/man2/wait.2.html exit_code = 1; - VERBOSE("child terminated by signal %d", WTERMSIG(child_status)) + VERBOSE("child terminated by signal %d", WTERMSIG(child_status)); } } @@ -227,7 +228,7 @@ void sq_mount(const char *img_path, char *mountpt) struct fuse_args mount_args = FUSE_ARGS_INIT(mount_argc, mount_argv); sq.mountpt = mountpt; - sq.chan = ch_malloc(sizeof(sqfs_ll_chan)); + sq.chan = ch_malloc(sizeof(sqfs_ll_chan), true); sq.ll = sqfs_ll_open(img_path, 0); Te (sq.ll != NULL, "can't open SquashFS: %s; try ch-run -vv?", img_path); diff --git a/bin/hook.c b/bin/hook.c index f65cac869..7a79bcab9 100644 --- a/bin/hook.c +++ b/bin/hook.c @@ -26,7 +26,7 @@ void hook_envs_set(struct container *c, void *d) void hook_envs_set_file(struct container *c, void *d) { struct env_file *ef = d; - envs_set(env_file_read(ef->path, ef->delim);, c->env_expand); + envs_set(env_file_read(ef->path, ef->delim), c->env_expand); } /* Unset the environment variables matching glob d. */ diff --git a/bin/json.c b/bin/json.c index 3826543b8..04ebcd618 100644 --- a/bin/json.c +++ b/bin/json.c @@ -14,6 +14,7 @@ #include "core.h" #include "json.h" +#include "mem.h" #include "misc.h" @@ -82,10 +83,10 @@ void cdiPC_kind(cJSON *tree, struct cdi_spec *spec); /** Globals **/ -// List of CDI specs we’ve read. Yes it’s a global, but that lets us keep -// struct cdi_spec private to this file, which seemed like the right -// trade-off. It also seemed like “all the specs we know about” wasn’t -// something we needed multiple of. +/* List of CDI specs we’ve read. 
Yes it’s a global, but that lets us keep + struct cdi_spec private to this file, which seemed like the right + trade-off. It also seemed like “all the specs we know about” wasn’t + something we needed multiple of. */ struct cdi_spec *cdi_specs = NULL; /* Callback tables. In the struct, the callback’s second argument is “void *” @@ -139,7 +140,7 @@ char **array_strings_json_to_c(cJSON *jarry, size_t *ct) Tf (cJSON_IsArray(jarry), "JSON: expected array"); *ct = cJSON_GetArraySize(jarry); - carry = ch_malloc((*ct + 1) * sizeof(char *)); + carry = ch_malloc((*ct + 1) * sizeof(char *), true); carry[*ct] = NULL; i = 0; @@ -195,9 +196,9 @@ char *cdi_hook_to_string(const char *hook_name, char **args) args_str = ""; for (size_t i = 0; args[i] != NULL; i++) - args_str = cats(3, as_old, " ", args[i]); + args_str = cats(3, args_str, " ", args[i]); - return ch_asprintf("%s:%s", hook_name, args_str)); + return ch_asprintf("%s:%s", hook_name, args_str); } /* Read the CDI spec files we need. @@ -228,16 +229,13 @@ void cdi_init(struct cdi_config *cf) // Read CDI spec files in configured directories if neccessary. 
if (cf->devs_all_p || req_by_kind) for (int i = 0; cf->spec_dirs[i] != NULL; i++) { - int entry_ct; - struct dirent **des; - entry_ct = dir_ls(cf->spec_dirs[i], &des); - for (int j = 0; j < entry_ct; j++) { - if (!fnmatch("*.json", des[i]->d_name, 0)) { - char *path = path_join(cf->spec_dirs[i], des[i]->d_name); - struct cdi_spec *spec = cdi_read_maybe(cdi_specs, path); - if (spec != NULL && cdi_requested(cf, spec)) - list_append((void **)&cdi_specs, spec, sizeof(*spec)); - } + char **entries = dir_glob(cf->spec_dirs[i], "*.json"); + for (int j = 0; entries[j] != NULL; j++) { + struct cdi_spec *spec; + spec = cdi_read_maybe(cdi_specs, + path_join(cf->spec_dirs[i], entries[j])); + if (spec != NULL && cdi_requested(cf, spec)) + list_append((void **)&cdi_specs, spec, sizeof(*spec)); } } @@ -294,12 +292,14 @@ struct cdi_spec *cdi_read(const char *path) Tf (fp = fopen(path, "rb"), "CDI: can't open: %s", path); Zf (fstat(fileno(fp), &st), "CDI: can't stat: %s", path); for (size_t used = 0, avail = READ_SZ; true; avail += READ_SZ) { - text = ch_realloc(text, avail); - size_t read_ct = fread(text + used, 1, READ_SZ, fp); + size_t read_ct; + text = ch_realloc(text, avail, false); + read_ct = fread(text + used, 1, READ_SZ, fp); used += read_ct; if (read_ct < READ_SZ) { if (feof(fp)) { // EOF reached - text[used] = '\0'; // ensure string ended + T_ (used < avail); + text[used] = '\0'; // terminate string break; } Tf(0, "CDI: can't read: %s", path); @@ -311,8 +311,8 @@ struct cdi_spec *cdi_read(const char *path) Tf(tree != NULL, "CDI: JSON failed at byte %d: %s", parse_end - text, path); // Visit parse tree to build our struct. 
- spec = ch_malloc(sizeof(struct cdi_spec)); - spec->src_path = path; + spec = ch_malloc(sizeof(struct cdi_spec), true); + spec->src_path = (char *)path; // shouldn’t ever be written spec->src_dev = st.st_dev; spec->src_ino = st.st_ino; visit(cdiPD_root, tree, spec); diff --git a/bin/mem.c b/bin/mem.c index eca1689af..308358f2d 100644 --- a/bin/mem.c +++ b/bin/mem.c @@ -17,13 +17,13 @@ ------------------------------ Because we use a lot of zero-terminated data structures, it would be nice - for the allocation functions to return zeroed buffers. We also want to not - require libgc, i.e., we want to still be able to use malloc(3) and - realloc(3) under the hood. It’s easy to provide a zeroing - malloc(3)-workalike, but as far as I can tell, it’s impossible to do so for - realloc(3)-alike unless we either (1) maintain our own allocation size - tracking or (2) use highly non-portable code. Neither of these seemed worth - the effort and complexity. + for the allocation functions to just always return zeroed buffers. We also + want to not require libgc, i.e., we want to still be able to use malloc(3) + and realloc(3) under the hood. It’s easy to provide a zeroing + malloc(3)-workalike, and we do, but as far as I can tell, it’s impossible + to do so for realloc(3)-alike unless we either (1) maintain our own + allocation size tracking or (2) use highly non-portable code. Neither of + these seemed worth the effort and complexity. This is because, as it turns out, the length of an allocated buffer is a more complicated notion than it seems. A buffer has *two* different @@ -138,7 +138,7 @@ char *ch_asprintf(const char *fmt, ...) va_list ap; char *str; - va_start(ap); + va_start(ap, fmt); str = ch_vasprintf(fmt, ap); va_end(ap); @@ -246,13 +246,13 @@ void *ch_malloc(size_t size, bool pointerful) /* Like ch_malloc(), but same API as malloc(3). Prefer ch_malloc(). This is provided for libraries that let us hook memory allocation and de-allocation, e.g. cJSON. 
*/ -void *ch_malloc(size_t size) +void *ch_malloc_pointerful(size_t size) { return ch_malloc(size, true); } /* Like ch_malloc(), but buffer contents are zeroed. */ -void ch_malloc_zeroed(size_t size, bool pointerful) +void *ch_malloc_zeroed(size_t size, bool pointerful) { void *buf = ch_malloc(size, pointerful); memset(buf, 0, size); @@ -284,9 +284,10 @@ void ch_memory_log(const char *when) char *line = NULL; char *s; ssize_t stack_len = 0, heap_len = 0, anon_len = 0; + ssize_t total_len, total_prev; #ifdef HAVE_GC struct GC_prof_stats_s ps; - ssize_t alloc, alloc_prev; + ssize_t used, used_prev; long time_collecting; #endif @@ -322,51 +323,56 @@ void ch_memory_log(const char *when) Z_ (fclose(fp)); // log the basics + total_len = stack_len + heap_len + anon_len; + total_prev = stack_prev + heap_prev + anon_prev; s = ch_asprintf("mem: %s: " - "stac %zdkB %+zd, heap %zdkB %+zd, anon %zdkB %+zd", + "%zdkB %+zd (stac %zdkB %+zd, heap %zdkB %+zd, anon %zdkB %+zd)", when, + kB(total_len), kB(total_len - total_prev), kB(stack_len), kB(stack_len - stack_prev), kB(heap_len), kB(heap_len - heap_prev), kB(anon_len), kB(anon_len - anon_prev)); + stack_prev = stack_len; + heap_prev = heap_len; + anon_prev = anon_len; DEBUG(s); #ifdef ENABLE_SYSLOG syslog(SYSLOG_PRI, "%s", s); #endif - stack_prev = stack_len; - heap_prev = heap_len; - anon_prev = anon_len; // log GC stuff #ifdef HAVE_GC GC_get_prof_stats(&ps, sizeof(ps)); time_collecting = GC_get_full_gc_total_time(); - alloc = ps.heapsize_full - ps.free_bytes_full; - alloc_prev = heapsize_prev - free_prev; - s = ch_asprintf("gc: " - "%s: %ld collections (%+ld) in %zdms (%+zd)", + // space + used = ps.heapsize_full - ps.free_bytes_full; + used_prev = heapsize_prev - free_prev; + s = ch_asprintf("gc: %s: " + "%zdkB %+zd (used %zdkB %+zd, free %zdkB %+zd, unmp %zdkB %+zd)", when, - ps.gc_no, ps.gc_no - gc_no_prev, - time_collecting, time_collecting - time_collecting_prev); + kB(ps.heapsize_full), kB(ps.heapsize_full - 
heapsize_prev), + kB(used), kB(used - used_prev), + kB(ps.free_bytes_full), kB(ps.free_bytes_full - free_prev), + kB(ps.unmapped_bytes), kB(ps.unmapped_bytes - unmapped_prev)); + heapsize_prev = ps.heapsize_full; + free_prev = ps.free_bytes_full; + unmapped_prev = ps.unmapped_bytes; DEBUG(s); #ifdef ENABLE_SYSLOG syslog(SYSLOG_PRI, "%s", s); #endif + // time + s = ch_asprintf("gc: " + "%s: %ld collections (%+ld) in %zdms (%+zd)", + when, + ps.gc_no, ps.gc_no - gc_no_prev, + time_collecting, time_collecting - time_collecting_prev); gc_no_prev = ps.gc_no; time_collecting_prev = time_collecting; - s = ch_asprintf("gc: %s: " - "totl %zdkB %+zd, allc %zdkB %+zd, free %zdkB %+zd, unmp %zdkB %+zd", - when, - kB(ps.heapsize_full), kB(ps.heapsize_full - heapsize_prev), - kB(alloc), kB(alloc - alloc_prev), - kB(ps.free_bytes_full), kB(ps.free_bytes_full - free_prev), - kB(ps.unmapped_bytes), kB(ps.unmapped_bytes - unmapped_prev)); DEBUG(s); #ifdef ENABLE_SYSLOG syslog(SYSLOG_PRI, "%s", s); #endif - heapsize_prev = ps.heapsize_full; - free_prev = ps.free_bytes_full; - unmapped_prev = ps.unmapped_bytes; #endif } @@ -424,7 +430,7 @@ char *ch_vasprintf(const char *fmt, va_list ap) { va_list ap2; int str_len; - char *str; + char *str; // = ch_malloc(1024, false); va_copy(ap2, ap); @@ -442,7 +448,7 @@ char *ch_vasprintf(const char *fmt, va_list ap) void garbageinate(const char *when) { #ifdef HAVE_GC - GC_collect_and_unmap(); + GC_gcollect_and_unmap(); ch_memory_log(when); #endif } diff --git a/bin/misc.c b/bin/misc.c index 31ab63bae..ee9d660c5 100644 --- a/bin/misc.c +++ b/bin/misc.c @@ -18,6 +18,7 @@ #include #include +#include "mem.h" #include "misc.h" @@ -113,7 +114,7 @@ char *argv_to_string(char **argv) char *s = NULL; for (size_t i = 0; argv[i] != NULL; i++) { - char *argv_, *x; + char *argv_; bool quote_p = false; // Max length is escape every char plus two quotes and terminating zero. @@ -162,7 +163,7 @@ char *argv_to_string(char **argv) } s = cats(5, s, i == 0 ? 
"" : " ", - quote_p ? "\"" : "", argv_, quote_p ? "\""); + quote_p ? "\"" : "", argv_, quote_p ? "\"" : ""); } return s; @@ -240,7 +241,7 @@ char *cats(size_t argc, ...) ret_len = 1; // for terminator for (int i = 0; i < argc; i++) { - char *arg = va_arg(ap); + char *arg = va_arg(ap, char *); if (arg == NULL) { argv[i] = ""; argv_lens[i] = 0; @@ -259,7 +260,7 @@ char *cats(size_t argc, ...) memcpy(next, argv[i], argv_lens[i]); next += argv_lens[i]; } - ret[ret_len] = '\0'; + ret[ret_len-1] = '\0'; return ret; } @@ -295,7 +296,7 @@ char **dir_glob(const char *path, const char *glob) } if (i >= alloc_ct - 1) { alloc_ct *= 2; - entries = ch_realloc(allot_ct * sizeof(char *), true); + entries = ch_realloc(entries, alloc_ct * sizeof(char *), true); } entries[i] = entry->d_name; i++; @@ -309,7 +310,7 @@ char **dir_glob(const char *path, const char *glob) /* Return the number of matches for glob in path. */ int dir_glob_count(const char *path, const char *glob) { - return list_count((void **)dir_glob(path, glob), sizeof(char *)); + return list_count(dir_glob(path, glob), sizeof(char *)); } /* Read the file listing environment variables at path, with records separated @@ -336,7 +337,7 @@ struct env_var *env_file_read(const char *path, int delim) struct env_var var; char *line; errno = 0; - line = ch_getdelim(fp, delim, fp); + line = ch_getdelim(fp, delim); if (line == NULL) // EOF break; if (line[strlen(line) - 1] == (char)delim) // rm delimiter if present @@ -444,7 +445,7 @@ struct env_var env_var_parse(const char *line, const char *path, size_t lineno) if (path == NULL) where = ch_strdup(line); else - where = ch_asprintf("%s:%zu", path, lineno)); + where = ch_asprintf("%s:%zu", path, lineno); // Split line into variable name and value. 
split(&name, &value, line, '='); @@ -491,7 +492,7 @@ void list_append(void **ar, void *new, size_t size) T_ (new != NULL); ct = list_count(*ar, size); - *ar = ch_realloc(*ar, (ct+2)*size, true)); // existing + new + terminator + *ar = ch_realloc(*ar, (ct+2)*size, true); // existing + new + terminator memcpy(*ar + ct*size, new, size); // append new (no overlap) memset(*ar + (ct+1)*size, 0, size); // set new terminator } @@ -533,7 +534,7 @@ void *list_new(size_t size, size_t ct) { void *list; T_ (size > 0); - T_ (list = ch_malloc_zeroed(ct+1, size, true)); + T_ (list = ch_malloc_zeroed((ct+1) * size, true)); return list; } @@ -660,12 +661,12 @@ void mkdir_overmount(const char *path, const char *scratch) char *parent, *path2, *over, *path_dst; char *orig_dir = ".orig"; // resisted calling this .weirdal int entry_ct; - struct dirent **entries; + char **entries; VERBOSE("making writeable via symlink ranch: %s", path); path2 = ch_strdup(path); parent = dirname(path2); - over = ch_asprintf("%s/%d", scratch, dir_ls_count(scratch) + 1); + over = ch_asprintf("%s/%d", scratch, dir_glob_count(scratch, "*") + 1); path_dst = path_join(over, orig_dir); // bind-mounts @@ -677,11 +678,12 @@ void mkdir_overmount(const char *path, const char *scratch) "can't bind-mount: %s- > %s", over, parent); // symlink ranch - entry_ct = dir_glob_count(path_dst, "*"); + entries = dir_glob(path_dst, "*"); + entry_ct = list_count(entries, sizeof(entries[0])); DEBUG("existing entries: %d", entry_ct); for (int i = 0; i < entry_ct; i++) { - char * src = path_join(parent, entries[i]->d_name); - char * dst = path_join(orig_dir, entries[i]->d_name); + char * src = path_join(parent, entries[i]); + char * dst = path_join(orig_dir, entries[i]); Zf (symlink(dst, src), "can't symlink: %s -> %s", src, dst); } @@ -724,7 +726,7 @@ void mkdirs(const char *base, const char *path, char **denylist, next = NULL; while (component != NULL) { next = path_join(nextc, component); // canonical except for last - 
TRACE("mkdirs: next: %s", next) + TRACE("mkdirs: next: %s", next); component = strtok_r(NULL, "/", &saveptr); // next NULL if current last if (path_exists(next, &sb, false)) { if (S_ISLNK(sb.st_mode)) { @@ -752,7 +754,7 @@ void mkdirs(const char *base, const char *path, char **denylist, Tf (0, "can't mkdir: %s", next); } nextc = next; // canonical b/c we just created last component as dir - TRACE("mkdirs: created: %s", nextc) + TRACE("mkdirs: created: %s", nextc); } } TRACE("mkdirs: done"); @@ -841,7 +843,7 @@ void msgv(enum log_level level, const char *file, int line, int errno_, text_full = ch_asprintf("%s[%d]: %s%s%s (%s:%d%s)", program_invocation_short_name, getpid(), level_prefix, text_formatted, errno_desc, - file, line, errno_code)); + file, line, errno_code); fprintf(stderr, "%s%s%s\n", colour, text_full, colour_reset); if (fflush(stderr)) abort(); // can’t print an error b/c already trying to do that @@ -932,7 +934,7 @@ void path_split(const char *path, char **dir, char **base) { if (dir != NULL) *dir = dirname(ch_strdup(path)); - if (base != NULL) { + if (base != NULL) *base = basename(ch_strdup(path)); } @@ -978,7 +980,7 @@ char *realpath_(const char *path, bool fail_ok) if (pathc == NULL) { if (fail_ok) { - pathc = ch_strdup(path)); + pathc = ch_strdup(path); } else { Tf (false, "can't canonicalize: %s", path); } diff --git a/bin/misc.h b/bin/misc.h index b6288f0ef..2b0e16cd7 100644 --- a/bin/misc.h +++ b/bin/misc.h @@ -74,12 +74,12 @@ #define Zf(x, ...) if (x) msg_fatal(__FILE__, __LINE__, errno, __VA_ARGS__) #define Ze(x, ...) if (x) msg_fatal(__FILE__, __LINE__, 0, __VA_ARGS__) -#define FATAL(...) msg_fatal( __FILE__, __LINE__, 0, __VA_ARGS__); -#define WARNING(...) msg(LL_WARNING, __FILE__, __LINE__, 0, __VA_ARGS__); -#define INFO(...) msg(LL_INFO, __FILE__, __LINE__, 0, __VA_ARGS__); -#define VERBOSE(...) msg(LL_VERBOSE, __FILE__, __LINE__, 0, __VA_ARGS__); -#define DEBUG(...) msg(LL_DEBUG, __FILE__, __LINE__, 0, __VA_ARGS__); -#define TRACE(...) 
msg(LL_TRACE, __FILE__, __LINE__, 0, __VA_ARGS__); +#define FATAL(...) msg_fatal( __FILE__, __LINE__, 0, __VA_ARGS__) +#define WARNING(...) msg(LL_WARNING, __FILE__, __LINE__, 0, __VA_ARGS__) +#define INFO(...) msg(LL_INFO, __FILE__, __LINE__, 0, __VA_ARGS__) +#define VERBOSE(...) msg(LL_VERBOSE, __FILE__, __LINE__, 0, __VA_ARGS__) +#define DEBUG(...) msg(LL_DEBUG, __FILE__, __LINE__, 0, __VA_ARGS__) +#define TRACE(...) msg(LL_TRACE, __FILE__, __LINE__, 0, __VA_ARGS__) /** Types **/ diff --git a/bin/seccomp.c b/bin/seccomp.c index 2d0fa9057..bf620a0e3 100644 --- a/bin/seccomp.c +++ b/bin/seccomp.c @@ -240,7 +240,7 @@ void hook_seccomp_install(struct container *c, void *d) // wrapper. T_ (ii == p.len); // next instruction now one past the end of the buffer Z_ (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &p)); - DEBUG("seccomp: see contributor's guide to disassemble") + DEBUG("seccomp: see contributor's guide to disassemble"); // Test filter. This will fail if the kernel executes the call (because we // are not really privileged and the arguments are bogus) or succeed if diff --git a/configure.ac b/configure.ac index e9d24cdcd..121efd593 100644 --- a/configure.ac +++ b/configure.ac @@ -325,48 +325,30 @@ AS_IF([test -n "$inc_json"], # -L$lib_json added below AS_IF([test -n "$lib_libsquashfuse"], [ch_cflags="$ch_cflags -I$inc_libsquashfuse -L$lib_libsquashfuse"]) -AX_CHECK_COMPILE_FLAG([$ch_cflags], [ - CFLAGS="$CFLAGS $ch_cflags" -], [ - AC_MSG_ERROR([no suitable C99 compiler found]) -]) +AX_CHECK_COMPILE_FLAG([$ch_cflags], [], + [AC_MSG_ERROR([no suitable C99 compiler found])]) AS_IF([test "$CC" = icc], [AC_MSG_ERROR([icc not supported (see PR @%:@481)])]) ### ch-run required ########################################################## -# Only ch-run needs any kind of interesting library stuff; this variable holds -# the library arguments we need. 
This also requires us to use AC_CHECK_LIB -# instead of the (recommended by docs) AC_SEARCH_LIBS, because that adds -# things to LIBS, which we don’t want because it’s applied to all executables. -CH_RUN_LIBS= +# Note: We link both ch-run and ch-checkns with all the shared libraries, +# despite the latter using much less, depending on the compiler to omit +# libraries that aren’t actually used (gcc does this) or just not caring that +# extra libraries are linked. # argp_parse(3), which is included with glibc but not other libc’s, e.g. musl. -AC_MSG_CHECKING([for argp_parse in libc]) -AC_LINK_IFELSE([AC_LANG_SOURCE([[ - #include - - int main(void) - { - argp_parse(0, 1, NULL, 0, 0, 0); - return 0; - } - ]])], - [AC_MSG_RESULT([yes])], # built-in, no further action - [AC_MSG_RESULT([no]) # try external libargp - AC_CHECK_LIB( - [argp], [argp_parse], - [CH_RUN_LIBS="-largp $CH_RUN_LIBS"], - [AC_MSG_ERROR([argp_parse(3) not found; please report this bug])])]) +# In the latter case, we need an external libargp. +AC_SEARCH_LIBS(argp_parse, argp, [], + [AC_MSG_ERROR([argp_parse(3) not found; please report this bug])]) # pthreads; needed for “ch-run --join”. AX_PTHREAD -# POSIX IPC lives in librt. -AC_CHECK_LIB([rt], [shm_open], [CH_RUN_LIBS="-lrt $CH_RUN_LIBS"], [ - AC_MSG_ERROR([shm_open(3) not found]) -]) +# POSIX IPC sometimes lives in librt. +AC_SEARCH_LIBS(shm_open, rt, [], + [AC_MSG_ERROR([shm_open(3) not found; please report this bug])]) # User namespaces AC_MSG_CHECKING([if in chroot]) # https://unix.stackexchange.com/a/14346 @@ -488,13 +470,11 @@ AC_CHECK_DECL(FNM_EXTMATCH, # libgc. Note that we don’t try to ensure the header we find matches the # library we find. Hopefully that’s not a problem. 
AS_IF([test $want_gc = yes], [ - AC_CHECK_LIB(gc, GC_malloc, - [have_libgc=yes - AS_IF([test -n "$lib_gc"], - [CH_RUN_LIBS="-Wl,-rpath=$lib_gc $CH_RUN_LIBS"]) - CH_RUN_LIBS="-lgc $CH_RUN_LIBS"], - [have_libgc=no], - [$CH_RUN_LIBS]) + AC_SEARCH_LIBS(GC_malloc, gc, + [have_libgc=yes + AS_IF([test -n "$lib_gc"], + [ch_ldflags="-Wl,-rpath=$lib_gc $ch_ldflags"])], + [have_libgc=no]) AC_CHECK_HEADER([gc.h], [have_gc_h=yes], [have_gc_h=no]) @@ -509,13 +489,11 @@ AS_IF([test $need_gc = yes && test $have_gc = no], # cJSON. Also do not check this header matches the library we find. AS_IF([test $want_json = yes], [ - AC_CHECK_LIB(cjson, cJSON_ParseWithLength, - [have_libcjson=yes - AS_IF([test -n "$lib_json"], - [CH_RUN_LIBS="-Wl,-rpath=$lib_json $CH_RUN_LIBS"]) - CH_RUN_LIBS="-lcjson $CH_RUN_LIBS"], - [have_libcjson=no], - [$CH_RUN_LIBS]) + AC_SEARCH_LIBS(cJSON_ParseWithLength, cjson, + [have_libcjson=yes + AS_IF([test -n "$lib_json"], + [ch_ldflags="-Wl,-rpath=$lib_json $ch_ldflags"])], + [have_libcjson=no]) # The include file installs by default to “$PREFIX/include/cjson/cJSON.h”, # but --with-json-include shouldn’t require a “cjson” subdirectory and it # seemed impossible to document that concisely anyway. Thereforre, try both @@ -607,18 +585,30 @@ AS_IF([test $want_libsquashfuse = yes], [ [AC_MSG_ERROR([need pkg-config to find libfuse3; try --with-libsquashfuse=no or see issue @%:@1844])]) AS_IF([pkg-config --exists fuse3], [ have_libfuse3=yes - CFLAGS="$CFLAGS $(pkg-config --cflags fuse3)" + ch_cflags="$ch_cflags $(pkg-config --cflags fuse3)" + # add -lfuse3 to LIBS (we already know it is available) + AC_SEARCH_LIBS(fuse_session_new, fuse3, [], + [AC_MSG_ERROR([libfuse3 found but not found; please report this bug])]) # libsquashfuse? - AC_CHECK_LIB([squashfuse_ll], [sqfs_ll_mount], - [have_libsquashfuse_ll=yes], - [have_libsquashfuse_ll=no]) - # ll.h? + AC_SEARCH_LIBS(sqfs_ll_mount, squashfuse_ll, + [have_libsquashfuse_ll=yes], + [have_libsquashfuse_ll=no]) + # ll.h? 
This check is hairy because AC_CHECK_HEADERS tries to actually + # compile a program that includes the header, but that won’t work for ll.h + # without the -I for fuse3 we got from pkg-config. We’re also advised not + # to change $CFLAGS within configure.ac [1]. I couldn’t figure out a way + # to get the -I into AC_CHECK_HEADER without changing $CFLAGS, so I just + # put it back to follow the advice as best I could. + # [1]: https://www.gnu.org/software/autoconf/manual/autoconf-2.65/html_node/Preset-Output-Variables.html + cflags_old=$CFLAGS + CFLAGS="$ch_cflags $CFLAGS" AC_CHECK_HEADER([squashfuse/ll.h], [have_ll_h=yes], [have_ll_h=no], [#define SQFS_CONFIG_H #define FUSE_USE_VERSION 32 ]) # see comment in fuse.c regarding these defines + CFLAGS=$cflags_old ], [have_libfuse3=no]) ]) @@ -630,9 +620,7 @@ AS_IF([ test $want_libsquashfuse = yes \ && test $have_ll_h = yes], [have_libsquashfuse=yes AS_IF([test -n "$lib_libsquashfuse"], - [rpath_libsquashfuse=-Wl,-rpath=$lib_libsquashfuse], - [rpath_libsquashfuse=]) - CH_RUN_LIBS="-lsquashfuse_ll -lfuse3 $rpath_libsquashfuse $CH_RUN_LIBS"], + [ch_ldflags="-Wl,-rpath=$lib_libsquashfuse $ch_ldflags"])], [have_libsquashfuse=no]) AS_IF([ test $need_libsquashfuse = yes \ && test $have_libsquashfuse = no], @@ -888,9 +876,10 @@ AS_IF([test $enable_syslog = yes], [AC_DEFINE([ENABLE_SYSLOG], [1], [log to syslog])]) AM_CONDITIONAL([ENABLE_TEST], [test $enable_test = yes]) -AC_SUBST([CH_RUN_LIBS]) -AC_SUBST([PYTHON_SHEBANG]) -AC_SUBST([SPHINX]) +AC_SUBST(AM_CFLAGS, [$ch_cflags]) +AC_SUBST(AM_LDFLAGS, [$ch_ldflags]) +AC_SUBST(PYTHON_SHEBANG) +AC_SUBST(SPHINX) AS_IF([test $have_fnm_extmatch = yes], [AC_DEFINE([HAVE_FNM_EXTMATCH], [1], [extended globs supported])]) @@ -1073,8 +1062,9 @@ Building Charliecloud required: C99 compiler ... ${CC} - \$CFLAGS ... ${CFLAGS} - ch-run(1) library args ... ${CH_RUN_LIBS} + \$CFLAGS ... ${ch_cflags} + \$LDFLAGS ... ${ch_ldflags} + library args ... 
${LIBS} extended glob patterns in --unset-env: ${have_fnm_extmatch} diff --git a/doc/dev.rst b/doc/dev.rst index 380952983..4c5f85236 100644 --- a/doc/dev.rst +++ b/doc/dev.rst @@ -944,11 +944,14 @@ C code Memory management ~~~~~~~~~~~~~~~~~ -*TL;DR:* Charliecloud does not call :code:`free(3)`. +*TL;DR:* Charliecloud does not free any memory. You can enable garbage +collection with :code:`libgc` if you want, and this is the default, but it may +not be necessary, i.e. simply leaking all allocated memory could still be +smaller than the overhead of trying to clean up. *How-To:* (1) Use Charliecloud wrappers for all library functions that allocate memory, e.g. :code:`ch_malloc()` instead of :code:`malloc(3)`. -Importantly, this includes things like :code:`strdup(3)` makeand +Importantly, this includes things like :code:`strdup(3)` and :code:`asprintf(3)`. (2) Don’t call :code:`free(3)` or any other library functions that free memory. @@ -978,9 +981,10 @@ can either: 1. YOLO, i.e. simply never free anything, i.e. leak like a sieve. But Charliecloud is still a small program and it’s unlikely to be an actual - problem. Quick-and-dirty tests show a main :code:`ch-run` process using - **FIXME** MiB just before it executes the user program, and the SquashFUSE - process **FIXME** MiB upon exit. + problem. Our quick-and-dirty tests with a small “hello world” Alpine image + running :code:`true(1)` show a main :code:`ch-run` process using 350 KiB + just before it executes the user program, and the SquashFUSE process the + same just before forking and 1,600 KiB upon exit. 2. Link with :code:`libgc`, i.e. the `Boehm-Demers-Weiser `_ conservative garbage collector. The idea is @@ -988,8 +992,12 @@ can either: for integers that *look* like pointers and assumes they *are* pointers. Apparently it `works quite well `_ and can even be faster than explicit memory management in some cases. 
The - quick-and-dirty tests show **FIXME** MiB by the main process, and the - SquashFUSE process **FIXME** just after forking and **FIXME** upon exit. + quick-and-dirty tests show 900 KiB by the main process, and the SquashFUSE + process the same just before forking (after an explicit garbage collection) + and 2,200 KiB upon exit. + +:code:`ch-run` logs memory usage to syslog, and also stderr with :code:`-vv`, +so you can analyze your specific situation. :code:`const` ~~~~~~~~~~~~~ From a3e454df2fa1bedf4ee28e21987f089732edda2a Mon Sep 17 00:00:00 2001 From: Reid Priedhorsky Date: Tue, 1 Oct 2024 11:21:52 -0600 Subject: [PATCH 29/29] make CDI EV hooks work [skip ci] --- bin/ch-run.c | 4 ++-- bin/json.c | 25 ++++++++++++++----------- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/bin/ch-run.c b/bin/ch-run.c index f5c27a2ed..c8833b0b6 100644 --- a/bin/ch-run.c +++ b/bin/ch-run.c @@ -433,12 +433,12 @@ void hooks_env_install(struct args *args) case ENV_CDI_DEV: name = "env-set-cdi"; f = hook_envs_set; - //d = cdi_envs_get(arg); + d = cdi_envs_get(arg); break; case ENV_CDI_ALL: name = "env-set-cdi-all"; f = hook_envs_set; - //d = cdi_envs_get(NULL); + d = cdi_envs_get(NULL); case ENV_END: T_ (false); // unreachable break; diff --git a/bin/json.c b/bin/json.c index 04ebcd618..adf6d5182 100644 --- a/bin/json.c +++ b/bin/json.c @@ -160,20 +160,23 @@ bool cdi_devid_kind_p(const char *devid) return (devid[0] != '.' && devid[0] != '/'); } -/* Return a list of environment variables to be set for device kind kind, or - if kind is NULL, all known devices. Both the list and the buffers within - are newly allocated; the caller must free the list with envs_free(). */ +/* Return a list of environment variables to be set for device devid, which + can be either a device kind or a path, or if devid is NULL, all known + devices. 
*/ struct env_var *cdi_envs_get(const char *devid) { - //struct env_var *vars; - - // count variables so we can do just one allocation - //for () - - // set up the list + struct env_var *vars = list_new(sizeof(struct env_var), 0); + + for (int i = 0; cdi_specs[i].kind != NULL; i++) { + // Compare devid with both kind and path without checking what it is + // because it seemed the odds of false positive low enough. + if ( devid == NULL + || !strcmp(devid, cdi_specs[i].kind) + || !strcmp(devid, cdi_specs[i].src_path)) + list_append((void **)&vars, cdi_specs[i].envs, sizeof(vars[0])); + } - //return vars; - return NULL; + return vars; } void cdi_hook_nv_ldcache(struct cdi_spec *spec, char **args)