diff --git a/.gitignore b/.gitignore index cee56d28d..4af822fca 100644 --- a/.gitignore +++ b/.gitignore @@ -44,6 +44,7 @@ a.out /charliecloud-*/ # debugging crap +core /build-cache.gv /build-cache.pdf diff --git a/bin/Makefile.am b/bin/Makefile.am index 0b2b9a77e..d96ef4502 100644 --- a/bin/Makefile.am +++ b/bin/Makefile.am @@ -6,16 +6,18 @@ bin_PROGRAMS = ch-checkns ch-run -ch_checkns_SOURCES = ch-checkns.c ch_misc.h ch_misc.c +ch_checkns_SOURCES = ch-checkns.c mem.h mem.c misc.h misc.c -ch_run_SOURCES = ch-run.c ch_core.h ch_core.c ch_misc.h ch_misc.c +ch_run_SOURCES = ch-run.c core.h core.c hook.h hook.c mem.h mem.c misc.h misc.c +if HAVE_JSON +ch_run_SOURCES += json.h json.c +endif if HAVE_LIBSQUASHFUSE -ch_run_SOURCES += ch_fuse.h ch_fuse.c +ch_run_SOURCES += fuse.h fuse.c +endif +if HAVE_SECCOMP +ch_run_SOURCES += seccomp.h seccomp.c endif - -# additional build flags for ch-run -ch_run_CFLAGS = $(PTHREAD_CFLAGS) -ch_run_LDADD = $(CH_RUN_LIBS) ## Shell scripts - distributed as-is diff --git a/bin/ch-checkns.c b/bin/ch-checkns.c index 10f26969a..6bc18134e 100644 --- a/bin/ch-checkns.c +++ b/bin/ch-checkns.c @@ -49,7 +49,7 @@ #include #include "config.h" -#include "ch_misc.h" +#include "misc.h" const char usage[] = "\ diff --git a/bin/ch-run.c b/bin/ch-run.c index 774f02ed9..c8833b0b6 100644 --- a/bin/ch-run.c +++ b/bin/ch-run.c @@ -4,6 +4,8 @@ are modest and the program is short-lived. 
*/ #define _GNU_SOURCE +#include "config.h" + #include #include #include @@ -12,9 +14,54 @@ #include #include -#include "config.h" -#include "ch_core.h" -#include "ch_misc.h" +#include "core.h" +#include "hook.h" +#ifdef HAVE_JSON +#include "json.h" +#endif +#include "mem.h" +#include "misc.h" +#ifdef HAVE_SECCOMP +#include "seccomp.h" +#endif + + +/** Types **/ + +enum env_option_type { + ENV_END = 0, // list terminator sentinel + ENV_SET, // --set-env + ENV_SET0, // --set-env0 + ENV_UNSET, // --unset-env + ENV_CDI_DEV, // --device (specific device) + ENV_CDI_ALL, // --devices (all known devices) +}; + +struct env_option { + enum env_option_type opt; + char *arg; +}; + +struct args { + struct container c; +#ifdef HAVE_JSON + struct cdi_config cdi; +#endif + struct env_option *env_options; + enum log_color_when log_color; + enum log_test log_test; + char *initial_dir; +#ifdef HAVE_SECCOMP + bool seccomp_p; +#endif + char *storage_dir; + bool unsafe; +}; + +struct log_color_synonym { + char *name; + enum log_color_when color; +}; /** Constants and macros **/ @@ -30,6 +77,20 @@ char *JOIN_TAG_ENV[] = { "SLURM_STEP_ID", /* Default overlaid tmpfs size. */ char *WRITE_FAKE_DEFAULT = "12%"; +/* Log color WHEN synonyms. Note that no argument (i.e., bare --color) is + handled separately. */ +struct log_color_synonym log_color_synonyms[] = { + { "auto", LL_COLOR_AUTO }, + { "tty", LL_COLOR_AUTO }, + { "if-tty", LL_COLOR_AUTO }, + { "yes", LL_COLOR_YES }, + { "always", LL_COLOR_YES }, + { "force", LL_COLOR_YES }, + { "no", LL_COLOR_NO }, + { "never", LL_COLOR_NO }, + { "none", LL_COLOR_NO }, + { NULL, LL_COLOR_NULL } }; + /** Command line options **/ @@ -49,9 +110,20 @@ const char args_doc[] = "IMAGE -- COMMAND [ARG...]"; /* Note: Long option numbers, once issued, are permanent; i.e., if you remove one, don’t re-number the others. 
*/ const struct argp_option options[] = { + { "abort-fatal", -21, 0, 0, + "exit abnormally on error, maybe dumping core" }, { "bind", 'b', "SRC[:DST]", 0, "mount SRC at guest DST (default: same as SRC)"}, { "cd", 'c', "DIR", 0, "initial working directory in container"}, +#ifdef HAVE_JSON + { "cdi-dirs", -19, "DIRS", 0, "director(y|ies) containing CDI specs" }, +#endif + { "color", -20, "WHEN", OPTION_ARG_OPTIONAL, + "specify when to use colored logging" }, +#ifdef HAVE_JSON + { "device", -18, "DEV", 0, "inject CDI device(s) DEV (repeatable)" }, + { "devices", 'd', 0, 0, "inject default CDI devices" }, +#endif { "env-no-expand", -10, 0, 0, "don't expand $ in --set-env input"}, { "feature", -11, "FEAT", 0, "exit successfully if FEAT is enabled" }, { "gid", 'g', "GID", 0, "run as GID within container" }, @@ -87,33 +159,19 @@ const struct argp_option options[] = { }; -/** Types **/ - -struct args { - struct container c; - struct env_delta *env_deltas; - char *initial_dir; -#ifdef HAVE_SECCOMP - bool seccomp_p; -#endif - char *storage_dir; - bool unsafe; -}; - - /** Function prototypes **/ -void fix_environment(struct args *args); bool get_first_env(char **array, char **name, char **value); +void hooks_env_install(struct args *args); void img_directory_verify(const char *img_path, const struct args *args); int join_ct(int cli_ct); char *join_tag(char *cli_tag); +void parse_env(struct env_option **opts, enum env_option_type opt, char *arg); int parse_int(char *s, bool extra_ok, char *error_tag); static error_t parse_opt(int key, char *arg, struct argp_state *state); -void parse_set_env(struct args *args, char *arg, int delim); void privs_verify_invoking(); char *storage_default(void); -extern void warnings_reprint(void); +void write_fake_enable(struct args *args, char *overlay_size); /** Global variables **/ @@ -138,12 +196,15 @@ int main(int argc, char *argv[]) T_ (warnings != MAP_FAILED); privs_verify_invoking(); + ch_memory_init(); + // note: exit functions not called 
on fatal error if --abort-fatal + Z_ (atexit(ch_memory_exit)); Z_ (atexit(warnings_reprint)); #ifdef ENABLE_SYSLOG - syslog(LOG_USER|LOG_INFO, "uid=%u args=%d: %s", getuid(), argc, - argv_to_string(argv)); + syslog(SYSLOG_PRI, "uid=%u args=%d: %s", + getuid(), argc, argv_to_string(argv)); #endif username = getenv("USER"); @@ -151,29 +212,41 @@ int main(int argc, char *argv[]) verbose = LL_INFO; // in ch_misc.c args = (struct args){ - .c = (struct container){ .binds = list_new(sizeof(struct bind), 0), - .container_gid = getegid(), - .container_uid = geteuid(), - .env_expand = true, - .host_home = NULL, - .img_ref = NULL, - .newroot = NULL, - .join = false, - .join_ct = 0, - .join_pid = 0, - .join_tag = NULL, - .overlay_size = NULL, - .private_passwd = false, - .private_tmp = false, - .type = IMG_NONE, - .writable = false }, - .env_deltas = list_new(sizeof(struct env_delta), 0), - .initial_dir = NULL, -#ifdef HAVE_SECCOMP - .seccomp_p = false, + .c = (struct container){ + .binds = list_new(sizeof(struct bind), 0), + .container_gid = getegid(), + .container_uid = geteuid(), + .env_expand = true, + .hooks_prestart = list_new(sizeof(struct hook), 0), + .host_home = NULL, + .img_ref = NULL, + .ldconfigs = list_new(sizeof(char *), 0), + .newroot = NULL, + .join = false, + .join_ct = 0, + .join_pid = 0, + .join_tag = NULL, + .overlay_size = NULL, + .private_passwd = false, + .private_tmp = false, + .type = IMG_NONE, + .writable = false + }, +#ifdef HAVE_JSON + .cdi = (struct cdi_config){ + .spec_dirs = list_new_strings(':', env_get("CH_RUN_CDI_DIRS", + "/etc/cdi:/var/run/cdi")), + .devs_all_p = false, + .devids = list_new(sizeof(char *), 0), + }, #endif + .env_options = list_new(sizeof(struct env_option), 0), + .initial_dir = NULL, + .log_color = LL_COLOR_AUTO, + .log_test = LL_TEST_NONE, .storage_dir = storage_default(), - .unsafe = false }; + .unsafe = false + }; /* I couldn't find a way to set argp help defaults other than this environment variable. 
Kludge sets/unsets only if not already set. */ @@ -187,8 +260,15 @@ int main(int argc, char *argv[]) if (!argp_help_fmt_set) Z_ (unsetenv("ARGP_HELP_FMT")); + logging_init(args.log_color, args.log_test); + ch_memory_log("init"); +#ifdef HAVE_JSON + json_init(); +#endif + + if (arg_next >= argc - 1) { - printf("usage: ch-run [OPTION...] IMAGE -- COMMAND [ARG...]\n"); + fprintf(stderr, "usage: ch-run [OPTION...] IMAGE -- COMMAND [ARG...]\n"); FATAL("IMAGE and/or COMMAND not specified"); } args.c.img_ref = argv[arg_next++]; @@ -223,15 +303,12 @@ int main(int argc, char *argv[]) args.c.join_tag = join_tag(args.c.join_tag); } - if (getenv("TMPDIR") != NULL) - host_tmp = getenv("TMPDIR"); - else - host_tmp = "/tmp"; - c_argv = list_new(sizeof(char *), argc - arg_next); for (int i = 0; i < argc - arg_next; i++) c_argv[i] = argv[i + arg_next]; + host_tmp = env_get("TMPDIR", "/tmp"); // global in misc.c + VERBOSE("verbosity: %d", verbose); VERBOSE("image: %s", args.c.img_ref); VERBOSE("storage: %s", args.storage_dir); @@ -240,91 +317,137 @@ int main(int argc, char *argv[]) VERBOSE("container gid: %u", args.c.container_gid); VERBOSE("join: %d %d %s %d", args.c.join, args.c.join_ct, args.c.join_tag, args.c.join_pid); + VERBOSE("host $TMPDIR: %s", host_tmp); VERBOSE("private /tmp: %d", args.c.private_tmp); #ifdef HAVE_SECCOMP - VERBOSE("seccomp: %d", args.seccomp_p); + VERBOSE("seccomp: %s", bool_to_string(args.seccomp_p)); #endif - VERBOSE("unsafe: %d", args.unsafe); + VERBOSE("unsafe: %s", bool_to_string(args.unsafe)); - containerize(&args.c); - fix_environment(&args); -#ifdef HAVE_SECCOMP - if (args.seccomp_p) - seccomp_install(); +#ifdef HAVE_JSON + cdi_init(&args.cdi); #endif - run_user_command(c_argv, args.initial_dir); // should never return + hooks_env_install(&args); + //cdi_hook_ldconfig_install(&args.c.hook_prestart, &args.cdi); + + containerize(&args.c); + run_user_command(c_argv, args.initial_dir); // should never return exit(EXIT_FAILURE); } /** Supporting 
functions **/ -/* Adjust environment variables. Call once containerized, i.e., already - pivoted into new root. */ -void fix_environment(struct args *args) +/* Find the first environment variable in array that is set; put its name in + *name and its value in *value, and return true. If none are set, return + false, and *name and *value are undefined. */ +bool get_first_env(char **array, char **name, char **value) { - char *old_value, *new_value; + for (int i = 0; array[i] != NULL; i++) { + *name = array[i]; + *value = getenv(*name); + if (*value != NULL) + return true; + } + + return false; +} + +/* Set the default environment variables that come before the user-specified + environment changes. d must be NULL. */ +void hook_envs_def_first(struct container *c, void *d) +{ + char *vold; + T_ (d == NULL); // $HOME: If --home, set to “/home/$USER”. - if (args->c.host_home) { - Z_ (setenv("HOME", cat("/home/", username), 1)); - } else if (path_exists("/root", NULL, true)) { - Z_ (setenv("HOME", "/root", 1)); - } else - Z_ (setenv("HOME", "/", 1)); + if (c->host_home) + env_set("HOME", cat("/home/", username), false); + else if (path_exists("/root", NULL, true)) + env_set("HOME", "/root", false); + else + env_set("HOME", "/", false); // $PATH: Append /bin if not already present. - old_value = getenv("PATH"); - if (old_value == NULL) { + vold = getenv("PATH"); + if (vold == NULL) WARNING("$PATH not set"); - } else if ( strstr(old_value, "/bin") != old_value - && !strstr(old_value, ":/bin")) { - T_ (1 <= asprintf(&new_value, "%s:/bin", old_value)); - Z_ (setenv("PATH", new_value, 1)); - VERBOSE("new $PATH: %s", new_value); - } + else if (strstr(vold, "/bin") != vold && !strstr(vold, ":/bin")) + env_set("PATH", cat(vold, ":/bin"), false); // $TMPDIR: Unset. Z_ (unsetenv("TMPDIR")); +} - // --set-env and --unset-env. 
- for (size_t i = 0; args->env_deltas[i].action != ENV_END; i++) { - struct env_delta ed = args->env_deltas[i]; - switch (ed.action) { - case ENV_END: - Te (false, "unreachable code reached"); +/* Set the default environment variables that come after the user-specified + changes. d must be NULL. */ +void hook_envs_def_last(struct container *c, void *d) +{ + T_ (d == NULL); + env_set("CH_RUNNING", "Weird Al Yankovic", false); +} + +/* Install pre-start hooks for environment variable changes. */ +void hooks_env_install(struct args *args) +{ + hook_add(&args->c.hooks_prestart, HOOK_DUP_FAIL, + "env-def-first", hook_envs_def_first, NULL); + + for (int i = 0; args->env_options[i].opt != ENV_END; i++) { + char *name; + hookf_t *f; + void *d; + enum env_option_type opt = args->env_options[i].opt; + char *arg = args->env_options[i].arg; + + switch (opt) { + case ENV_SET: + case ENV_SET0: + int delim = (opt == ENV_SET) ? '\n' : '\0'; // ENV_SET0: NUL + if (arg == NULL) { // guest path; defer file read + struct env_file *ef; + name = "env-set-gfile"; + f = hook_envs_set_file; + ef = ch_malloc(sizeof(struct env_file), true); + ef->path = arg; + ef->delim = delim; + ef->expand = args->c.env_expand; + d = ef; + } else { + f = hook_envs_set; + if (strchr(arg, '=') == NULL) { // host path; read file now + name = "env-set-hfile"; + d = env_file_read(arg, delim); + } else { // direct set + name = "env-set-direct"; + d = list_new(sizeof(struct env_var), 1); + ((struct env_var *)d)[0] = env_var_parse(arg, NULL, 0); + } + } break; - case ENV_SET_DEFAULT: - ed.arg.vars = env_file_read("/ch/environment", ed.arg.delim); - // fall through - case ENV_SET_VARS: - for (size_t j = 0; ed.arg.vars[j].name != NULL; j++) - env_set(ed.arg.vars[j].name, ed.arg.vars[j].value, - args->c.env_expand); + case ENV_UNSET: + name = "env-unset"; + f = hook_envs_unset; + d = arg; break; - case ENV_UNSET_GLOB: - env_unset(ed.arg.glob); + case ENV_CDI_DEV: + name = "env-set-cdi"; + f = hook_envs_set; + d = cdi_envs_get(arg); + break; 
+ case ENV_CDI_ALL: + name = "env-set-cdi-all"; + f = hook_envs_set; + d = cdi_envs_get(NULL); break; // must not fall into ENV_END + case ENV_END: + T_ (false); // unreachable break; } + hook_add(&args->c.hooks_prestart, HOOK_DUP_OK, name, f, d); } - // $CH_RUNNING is not affected by --unset-env or --set-env. - Z_ (setenv("CH_RUNNING", "Weird Al Yankovic", 1)); -} - -/* Find the first environment variable in array that is set; put its name in - *name and its value in *value, and return true. If none are set, return - false, and *name and *value are undefined. */ -bool get_first_env(char **array, char **name, char **value) -{ - for (int i = 0; array[i] != NULL; i++) { - *name = array[i]; - *value = getenv(*name); - if (*value != NULL) - return true; - } - - return false; + hook_add(&args->c.hooks_prestart, HOOK_DUP_FAIL, + "env-def-last", hook_envs_def_last, NULL); } /* Validate that it’s OK to run the IMG_DIRECTORY format image at path; if @@ -380,7 +503,7 @@ char *join_tag(char *cli_tag) } VERBOSE("join: peer group tag from getppid(2)"); - T_ (1 <= asprintf(&tag, "%d", getppid())); + tag = ch_asprintf("%d", getppid()); end: Te(tag[0] != '\0', "join: peer group tag cannot be empty string"); @@ -425,15 +548,11 @@ static error_t parse_opt(int key, char *arg, struct argp_state *state) args->c.join_pid = parse_int(arg, false, "--join-pid"); break; case -6: // --set-env - parse_set_env(args, arg, '\n'); - break; - case -7: { // --unset-env - struct env_delta ed; - Te (strlen(arg) > 0, "--unset-env: GLOB must have non-zero length"); - ed.action = ENV_UNSET_GLOB; - ed.arg.glob = arg; - list_append((void **)&(args->env_deltas), &ed, sizeof(ed)); - } break; + parse_env(&args->env_options, ENV_SET, arg); + break; + case -7: // --unset-env + parse_env(&args->env_options, ENV_UNSET, arg); + break; case -9: // --no-passwd args->c.private_passwd = true; break; @@ -487,11 +606,12 @@ static error_t parse_opt(int key, char *arg, struct argp_state *state) break; #ifdef HAVE_SECCOMP case -14: // --seccomp - 
args->seccomp_p = true; + hook_add(&args->c.hooks_prestart, HOOK_DUP_SKIP, + "seccomp", hook_seccomp_install, NULL); break; #endif case -15: // --set-env0 - parse_set_env(args, arg, '\0'); + parse_env(&args->env_options, ENV_SET0, arg); break; case -16: // --warnings for (int i = 1; i <= parse_int(arg, false, "--warnings"); i++) @@ -500,36 +620,77 @@ static error_t parse_opt(int key, char *arg, struct argp_state *state) break; case -17: // --test if (!strcmp(arg, "log")) - test_logging(false); + args->log_test = LL_TEST_YES; else if (!strcmp(arg, "log-fail")) - test_logging(true); + args->log_test = LL_TEST_FATAL; else FATAL("invalid --test argument: %s; see source code", arg); break; +#ifdef HAVE_JSON + case -18: { // --device + struct env_option ope; + Te (strlen(arg) > 0, "--device: DEV must be non-empty"); + write_fake_enable(args, NULL); + list_append((void **)&args->cdi.devids, &arg, sizeof(arg)); + ope.opt = ENV_CDI_DEV; + ope.arg = arg; + list_append((void **)&args->env_options, &ope, sizeof(ope)); + } break; + case -19: // --cdi-dirs + Te (strlen(arg) > 0, "--cdi-dirs: PATHS must be non-empty"); + args->cdi.spec_dirs = list_new_strings(':', arg); + break; +#endif + case -20: // --color + if (arg == NULL) + arg = "auto"; // bare --color: resolve via synonym table + args->log_color = LL_COLOR_NULL; + for (int i = 0; true; i++) { + if (log_color_synonyms[i].name == NULL) + break; + if (!strcmp(arg, log_color_synonyms[i].name)) { + args->log_color = log_color_synonyms[i].color; + break; + } + } + Tf (args->log_color != LL_COLOR_NULL, "--color: invalid arg: %s", arg); + break; + case -21: // --abort-fatal + abort_fatal = true; // in misc.c + break; case 'b': { // --bind - char *src, *dst; - for (i = 0; args->c.binds[i].src != NULL; i++) // count existing binds - ; - T_ (args->c.binds = realloc(args->c.binds, - (i+2) * sizeof(struct bind))); - args->c.binds[i+1].src = NULL; // terminating zero - args->c.binds[i].dep = BD_MAKE_DST; - // source - src = strsep(&arg, ":"); - T_ (src != 
NULL); - Te (src[0] != 0, "--bind: no source provided"); - args->c.binds[i].src = src; - // destination - dst = arg ? arg : src; - Te (dst[0] != 0, "--bind: no destination provided"); - Te (strcmp(dst, "/"), "--bind: destination can't be /"); - Te (dst[0] == '/', "--bind: destination must be absolute"); - args->c.binds[i].dst = dst; + char *src, *dst; + i = list_count(args->c.binds, sizeof(args->c.binds[0])); + args->c.binds = ch_realloc(args->c.binds, (i+2) * sizeof(struct bind), + true); + memset(&args->c.binds[i+1], 0, sizeof(args->c.binds[0])); // terminate + args->c.binds[i].dep = BD_MAKE_DST; + // source + src = strsep(&arg, ":"); + T_ (src != NULL); + Te (src[0] != 0, "--bind: no source provided"); + args->c.binds[i].src = src; + // destination + dst = arg ? arg : src; + Te (dst[0] != 0, "--bind: no destination provided"); + Te (strcmp(dst, "/"), "--bind: destination can't be /"); + Te (dst[0] == '/', "--bind: destination must be absolute"); + args->c.binds[i].dst = dst; } break; case 'c': // --cd args->initial_dir = arg; break; +#ifdef HAVE_JSON + case 'd': { // --devices + // Can’t add the devices here b/c we don’t know the CDI spec dirs yet. + struct env_option ope; + args->cdi.devs_all_p = true; + ope.opt = ENV_CDI_ALL; + ope.arg = NULL; + list_append((void **)&args->env_options, &ope, sizeof(ope)); + } break; +#endif case 'g': // --gid i = parse_int(arg, false, "--gid"); Te (i >= 0, "--gid: must be non-negative"); @@ -573,7 +734,7 @@ static error_t parse_opt(int key, char *arg, struct argp_state *state) args->c.writable = true; break; case 'W': // --write-fake - args->c.overlay_size = arg != NULL ? 
arg : WRITE_FAKE_DEFAULT; + write_fake_enable(args, arg); break; case ARGP_KEY_NO_ARGS: argp_state_help(state, stderr, ( ARGP_HELP_SHORT_USAGE @@ -583,31 +744,20 @@ static error_t parse_opt(int key, char *arg, struct argp_state *state) exit(EXIT_FAILURE); default: return ARGP_ERR_UNKNOWN; - }; + } return 0; } -void parse_set_env(struct args *args, char *arg, int delim) +void parse_env(struct env_option **opts, enum env_option_type opt, char *arg) { - struct env_delta ed; - - if (arg == NULL) { - ed.action = ENV_SET_DEFAULT; - ed.arg.delim = delim; - } else { - ed.action = ENV_SET_VARS; - if (strchr(arg, '=') == NULL) - ed.arg.vars = env_file_read(arg, delim); - else { - ed.arg.vars = list_new(sizeof(struct env_var), 1); - ed.arg.vars[0] = env_var_parse(arg, NULL, 0); - } - } - list_append((void **)&(args->env_deltas), &ed, sizeof(ed)); + struct env_option eo = (struct env_option){ .opt = opt, + .arg = arg }; + Te (arg == NULL || strlen(arg) > 0, + "environment options: argument must have non-zero length"); + list_append((void **)opts, &eo, sizeof(eo)); } - /* Validate that the UIDs and GIDs are appropriate for program start, and abort if not. @@ -636,13 +786,28 @@ void privs_verify_invoking() T_ (euid == ruid && euid == suid); // no setuid or funny business } -/* Return path to the storage directory, if -s is not specified. */ +/* Return default path to the storage directory. */ char *storage_default(void) { char *storage = getenv("CH_IMAGE_STORAGE"); if (storage == NULL) - T_ (1 <= asprintf(&storage, "/var/tmp/%s.ch", username)); + storage = ch_asprintf("/var/tmp/%s.ch", username); return storage; } + +/* Enable the overlay if not already enabled. 
*/ +void write_fake_enable(struct args *args, char *overlay_size) +{ + if (overlay_size != NULL) { + // new overlay size specified: use it regardless of previous enablement + args->c.overlay_size = overlay_size; + } else if (args->c.overlay_size == NULL) { + // no new size, not yet enabled: enable with default size + args->c.overlay_size = WRITE_FAKE_DEFAULT; + } else { + // no new size, already enabled: keep existing size, nothing to do + T_ (args->c.overlay_size != NULL); + } +} diff --git a/bin/ch_core.c b/bin/core.c similarity index 57% rename from bin/ch_core.c rename to bin/core.c index 3850dbfa2..41c01fe50 100644 --- a/bin/ch_core.c +++ b/bin/core.c @@ -3,22 +3,12 @@ #define _GNU_SOURCE #include "config.h" -#include #include -#include -#ifdef HAVE_SECCOMP -#include -#include -#include -#endif #include #include #include +#include #include -#ifdef HAVE_SECCOMP -#include -#include -#endif #include #include #include @@ -29,10 +19,11 @@ #include #include -#include "ch_misc.h" -#include "ch_core.h" +#include "mem.h" +#include "misc.h" +#include "core.h" #ifdef HAVE_LIBSQUASHFUSE -#include "ch_fuse.h" +#include "fuse.h" #endif @@ -88,92 +79,6 @@ struct bind BINDS_DEFAULT[] = { { 0 } }; -/* Special values for seccomp tables. These must be negative to avoid clashing - with real syscall numbers (note zero is often a valid syscal number). */ -#define NR_NON -1 // syscall does not exist on architecture -#define NR_END -2 // end of table - -/* Architectures that we support for seccomp. Order matches the - corresponding table below. - - Note: On some distros (e.g., CentOS 7), some of the architecture numbers - are missing. The workaround is to use the numbers I have on Debian - Bullseye. The reason I (Reid) feel moderately comfortable doing this is how - militant Linux is about not changing the userspace API. 
*/ -#ifdef HAVE_SECCOMP -#ifndef AUDIT_ARCH_AARCH64 -#define AUDIT_ARCH_AARCH64 0xC00000B7u // undeclared on CentOS 7 -#undef AUDIT_ARCH_ARM // uses undeclared EM_ARM on CentOS 7 -#define AUDIT_ARCH_ARM 0x40000028u -#endif -int SECCOMP_ARCHS[] = { AUDIT_ARCH_AARCH64, // arm64 - AUDIT_ARCH_ARM, // arm32 - AUDIT_ARCH_I386, // x86 (32-bit) - AUDIT_ARCH_PPC64LE, // PPC - AUDIT_ARCH_S390X, // s390x - AUDIT_ARCH_X86_64, // x86-64 - NR_END }; -#endif - -/* System call numbers that we fake with seccomp (by doing nothing and - returning success). Some processors can execute multiple architectures - (e.g., 64-bit Intel CPUs can run both x64-64 and x86 code), and a process’ - architecture can even change (if you execve(2) binary of different - architecture), so we can’t just use the build host’s architecture. - - I haven’t figured out how to gather these system call numbers - automatically, so they are compiled from [1, 2, 3]. See also [4] for a more - general reference. - - NOTE: The total number of faked syscalls (i.e., non-zero entries below) - must be somewhat less than 256. I haven’t computed the exact limit. There - will be an assertion failure at runtime if this is exceeded. - - WARNING: Keep this list consistent with the ch-image(1) man page! 
- - [1]: https://chromium.googlesource.com/chromiumos/docs/+/HEAD/constants/syscalls.md#Cross_arch-Numbers - [2]: https://github.com/strace/strace/blob/v4.26/linux/powerpc64/syscallent.h - [3]: https://github.com/strace/strace/blob/v6.6/src/linux/s390x/syscallent.h - [4]: https://unix.stackexchange.com/questions/421750 */ -#ifdef HAVE_SECCOMP -int FAKE_SYSCALL_NRS[][6] = { - // arm64 arm32 x86 PPC64 s390x x86-64 - // ------ ------ ------ ------ ------ ------ - { 91, 185, 185, 184, 185, 126 }, // capset - { NR_NON, 182, 182, 181, 212, 92 }, // chown - { NR_NON, 212, 212, NR_NON, NR_NON, NR_NON }, // chown32 - { 55, 95, 95, 95, 207, 93 }, // fchown - { NR_NON, 207, 207, NR_NON, NR_NON, NR_NON }, // fchown32 - { 54, 325, 298, 289, 291, 260 }, // fchownat - { NR_NON, 16, 16, 16, 198, 94 }, // lchown - { NR_NON, 198, 198, NR_NON, NR_NON, NR_NON }, // lchown32 - { 104, 347, 283, 268, 277, 246 }, // kexec_load - { 152, 139, 139, 139, 216, 123 }, // setfsgid - { NR_NON, 216, 216, NR_NON, NR_NON, NR_NON }, // setfsgid32 - { 151, 138, 138, 138, 215, 122 }, // setfsuid - { NR_NON, 215, 215, NR_NON, NR_NON, NR_NON }, // setfsuid32 - { 144, 46, 46, 46, 214, 106 }, // setgid - { NR_NON, 214, 214, NR_NON, NR_NON, NR_NON }, // setgid32 - { 159, 81, 81, 81, 206, 116 }, // setgroups - { NR_NON, 206, 206, NR_NON, NR_NON, NR_NON }, // setgroups32 - { 143, 71, 71, 71, 204, 114 }, // setregid - { NR_NON, 204, 204, NR_NON, NR_NON, NR_NON }, // setregid32 - { 149, 170, 170, 169, 210, 119 }, // setresgid - { NR_NON, 210, 210, NR_NON, NR_NON, NR_NON }, // setresgid32 - { 147, 164, 164, 164, 208, 117 }, // setresuid - { NR_NON, 208, 208, NR_NON, NR_NON, NR_NON }, // setresuid32 - { 145, 70, 70, 70, 203, 113 }, // setreuid - { NR_NON, 203, 203, NR_NON, NR_NON, NR_NON }, // setreuid32 - { 146, 23, 23, 23, 213, 105 }, // setuid - { NR_NON, 213, 213, NR_NON, NR_NON, NR_NON }, // setuid32 - { NR_END }, // end -}; -int FAKE_MKNOD_NRS[] = - { NR_NON, 14, 14, 14, 14, 133 }; -int FAKE_MKNODAT_NRS[] = 
- { 33, 324, 297, 288, 290, 259 }; -#endif - /** Global variables **/ @@ -199,19 +104,16 @@ void bind_mount(const char *src, const char *dst, enum bind_dep, const char *newroot, unsigned long flags, const char *scratch); void bind_mounts(const struct bind *binds, const char *newroot, unsigned long flags, const char * scratch); -void enter_udss(struct container *c); -#ifdef HAVE_SECCOMP -void iw(struct sock_fprog *p, int i, - uint16_t op, uint32_t k, uint8_t jt, uint8_t jf); -#endif void join_begin(const char *join_tag); -void join_namespace(pid_t pid, const char *ns); -void join_namespaces(pid_t pid); void join_end(int join_ct); -void sem_timedwait_relative(sem_t *sem, int timeout); -void setup_namespaces(const struct container *c, uid_t uid_out, uid_t uid_in, +void mounts_setup(struct container *c); +void namespace_join(pid_t pid, const char *ns); +void namespaces_join(pid_t pid); +void namespaces_setup(const struct container *c, uid_t uid_out, uid_t uid_in, gid_t gid_out, gid_t gid_in); -void setup_passwd(const struct container *c); +void passwd_setup(const struct container *c); +void pivot(struct container *c); +void sem_timedwait_relative(sem_t *sem, int timeout); void tmpfs_mount(const char *dst, const char *newroot, const char *data); @@ -268,7 +170,7 @@ void bind_mounts(const struct bind *binds, const char *newroot, void containerize(struct container *c) { if (c->join_pid) { - join_namespaces(c->join_pid); + namespaces_join(c->join_pid); return; } if (c->join) @@ -278,111 +180,72 @@ void containerize(struct container *c) // fusermount3 non-setuid, and the inner so we get the desired UID // within the container. We do this even if the image is a directory, to // reduce the number of code paths. 
- setup_namespaces(c, geteuid(), 0, getegid(), 0); + namespaces_setup(c, geteuid(), 0, getegid(), 0); #ifdef HAVE_LIBSQUASHFUSE if (c->type == IMG_SQUASH) sq_fork(c); #endif - setup_namespaces(c, 0, c->container_uid, 0, c->container_gid); - enter_udss(c); + namespaces_setup(c, 0, c->container_uid, 0, c->container_gid); + mounts_setup(c); + VERBOSE("prestart hooks: %d", list_count(c->hooks_prestart, + sizeof(struct hook))); + hooks_run(c, &c->hooks_prestart); + pivot(c); } else - join_namespaces(join.shared->winner_pid); + namespaces_join(join.shared->winner_pid); if (c->join) join_end(c->join_ct); } -/* Enter the new root (UDSS). On entry, the namespaces are set up, and this - does the mounting and filesystem setup. +/* Append hook function f to hook_list. When called, the hook will be passed + d; this lets hooks receive arbitrary arguments (i.e., it’s a poor person’s + closure). hook_list must be a member of c. + + “dup” says what to do if a hook with the same name is already in the list: - Note that pivot_root(2) requires a complex dance to work, i.e., to avoid - multiple undocumented error conditions. This dance is explained in detail - in bin/ch-checkns.c. */ -void enter_udss(struct container *c) + HOOK_DUP_OK add the hook anyway + HOOK_DUP_SKIP silently do nothing (i.e., don’t add the hook) + HOOK_DUP_FAIL fatal error */ +void hook_add(struct hook **hook_list, enum hook_dup dup, + const char *name, hookf_t *f, void *d) { - char *nr_parent, *nr_base, *mkdir_scratch; + // FIXME: hooks: environment variables, seccomp, CDI - LOG_IDS; - mkdir_scratch = NULL; - path_split(c->newroot, &nr_parent, &nr_base); + struct hook h; - // Claim new root for this namespace. Despite MS_REC in bind_mount(), we do - // need both calls to avoid pivot_root(2) failing with EBUSY later. 
- DEBUG("claiming new root for this namespace") - bind_mount(c->newroot, c->newroot, BD_REQUIRED, "/", MS_PRIVATE, NULL); - bind_mount(nr_parent, nr_parent, BD_REQUIRED, "/", MS_PRIVATE, NULL); - // Re-mount new root read-only unless --write or already read-only. - if (!c->writable && !(access(c->newroot, W_OK) == -1 && errno == EROFS)) { - unsigned long flags = path_mount_flags(c->newroot) - | MS_REMOUNT // Re-mount ... - | MS_BIND // only this mount point ... - | MS_RDONLY; // read-only. - Z_ (mount(NULL, c->newroot, NULL, flags, NULL)); - } - // Overlay a tmpfs if --write-fake. See for useful details: - // https://www.kernel.org/doc/html/v5.11/filesystems/tmpfs.html - // https://www.kernel.org/doc/html/v5.11/filesystems/overlayfs.html - if (c->overlay_size != NULL) { - char *options; - struct stat st; - VERBOSE("overlaying tmpfs for --write-fake (%s)", c->overlay_size); - T_ (1 <= asprintf(&options, "size=%s", c->overlay_size)); - Zf (mount(NULL, WF_MNT, "tmpfs", 0, options), - "cannot mount tmpfs for overlay"); - free(options); - Z_ (mkdir(WF_MNT "/upper", 0700)); - Z_ (mkdir(WF_MNT "/work", 0700)); - Z_ (mkdir(WF_MNT "/merged", 0700)); - mkdir_scratch = WF_MNT "/mkdir_overmount"; - Z_ (mkdir(mkdir_scratch, 0700)); - T_ (1 <= asprintf(&options, ("lowerdir=%s,upperdir=%s,workdir=%s," - "index=on,userxattr,volatile"), - c->newroot, WF_MNT "/upper", WF_MNT "/work")); - // update newroot - Zf (stat(c->newroot, &st), - "can't stat new root; overmounted by tmpfs for -W?: %s", c->newroot); - c->newroot = WF_MNT "/merged"; - free(nr_parent); - free(nr_base); - path_split(c->newroot, &nr_parent, &nr_base); - Zf (mount(NULL, c->newroot, "overlay", 0, options), - "can't overlay: %s, %s", c->newroot, options); - VERBOSE("newroot updated: %s", c->newroot); - free(options); - } - DEBUG("starting bind-mounts"); - // Bind-mount default files and directories. - bind_mounts(BINDS_DEFAULT, c->newroot, MS_RDONLY, NULL); - // /etc/passwd and /etc/group. 
- if (!c->private_passwd) - setup_passwd(c); - // Container /tmp. - if (c->private_tmp) { - tmpfs_mount("/tmp", c->newroot, NULL); - } else { - bind_mount(host_tmp, "/tmp", BD_REQUIRED, c->newroot, 0, NULL); + if (dup == HOOK_DUP_SKIP || dup == HOOK_DUP_FAIL) { + bool dup_found = false; + for (int i = 0; (*hook_list)[i].name != NULL; i++) + if (!strcmp((*hook_list)[i].name, name)) { + dup_found = true; + break; + } + if (dup_found) { + Te (dup == HOOK_DUP_SKIP, "invalid duplicate hook: %s", name); + return; // skip adding hook + } } - // Bind-mount user’s home directory at /home/$USER if requested. - if (c->host_home) { - T_ (c->overlay_size != NULL); - bind_mount(c->host_home, cat("/home/", username), - BD_MAKE_DST, c->newroot, 0, mkdir_scratch); + + h.name = name; + h.f = f; + h.data = d; + + list_append((void **)hook_list, &h, sizeof(h)); +} + +/* Run hooks in hook_list, passing c, then set *hook_list to NULL. hook_list + must be a member of c. */ +void hooks_run(struct container *c, struct hook **hook_list) +{ + int hook_ct = list_count(*hook_list, sizeof((*hook_list)[0])); + for (int i = 0; i < hook_ct; i++) { + struct hook h = (*hook_list)[i]; + DEBUG("calling hook %d/%d: %s", i+1, hook_ct, h.name); + h.f(c, h.data); } - // Bind-mount user-specified directories. - bind_mounts(c->binds, c->newroot, 0, mkdir_scratch); - // Overmount / to avoid EINVAL if it’s a rootfs. - Z_ (chdir(nr_parent)); - Z_ (mount(nr_parent, "/", NULL, MS_MOVE, NULL)); - Z_ (chroot(".")); - // Pivot into the new root. Use /dev because it’s available even in - // extremely minimal images. 
- c->newroot = cat("/", nr_base); - Zf (chdir(c->newroot), "can't chdir into new root"); - Zf (syscall(SYS_pivot_root, c->newroot, path_join(c->newroot, "dev")), - "can't pivot_root(2)"); - Zf (chroot("."), "can't chroot(2) into new root"); - Zf (umount2("/dev", MNT_DETACH), "can't umount old root"); - DEBUG("pivot_root(2) dance successful") + + *hook_list = NULL; } /* Return image type of path, or exit with error if not a valid type. */ @@ -426,27 +289,13 @@ enum img_type image_type(const char *ref, const char *storage_dir) char *img_name2path(const char *name, const char *storage_dir) { - char *path; - char *name_fs = strdup(name); + char *name_fs = ch_strdup(name); replace_char(name_fs, '/', '%'); replace_char(name_fs, ':', '+'); - T_ (1 <= asprintf(&path, "%s/img/%s", storage_dir, name_fs)); - - free(name_fs); // make Tim happy - return path; -} - -/* Helper function to write seccomp-bpf programs. */ -#ifdef HAVE_SECCOMP -void iw(struct sock_fprog *p, int i, - uint16_t op, uint32_t k, uint8_t jt, uint8_t jf) -{ - p->filter[i] = (struct sock_filter){ op, jt, jf, k }; - DEBUG("%4d: { op=%2x k=%8x jt=%3d jf=%3d }", i, op, k, jt, jf); + return path_join(storage_dir, path_join("img", name_fs)); } -#endif /* Begin coordinated section of namespace joining. */ void join_begin(const char *join_tag) @@ -515,13 +364,84 @@ void join_end(int join_ct) VERBOSE("join: done"); } +/* Set up the container filesystem tree. Namespaces must already be done. */ +void mounts_setup(struct container *c) +{ + char *nr_parent, *mkdir_scratch; + + VERBOSE("creating container filesystem tree"); + LOG_IDS; + mkdir_scratch = NULL; + path_split(c->newroot, &nr_parent, NULL); + + // Claim new root for this namespace. Despite MS_REC in bind_mount(), we do + // need both calls to avoid pivot_root(2) failing with EBUSY later. 
+ DEBUG("claiming new root for this namespace"); + bind_mount(c->newroot, c->newroot, BD_REQUIRED, "/", MS_PRIVATE, NULL); + bind_mount(nr_parent, nr_parent, BD_REQUIRED, "/", MS_PRIVATE, NULL); + // Re-mount new root read-only unless --write or already read-only. + if (!c->writable && !(access(c->newroot, W_OK) == -1 && errno == EROFS)) { + unsigned long flags = path_mount_flags(c->newroot) + | MS_REMOUNT // Re-mount ... + | MS_BIND // only this mount point ... + | MS_RDONLY; // read-only. + Z_ (mount(NULL, c->newroot, NULL, flags, NULL)); + } + // Overlay a tmpfs if --write-fake. See for useful details: + // https://www.kernel.org/doc/html/v5.11/filesystems/tmpfs.html + // https://www.kernel.org/doc/html/v5.11/filesystems/overlayfs.html + if (c->overlay_size != NULL) { + char *options; + struct stat st; + VERBOSE("overlaying tmpfs for --write-fake (%s)", c->overlay_size); + options = cat("size=", c->overlay_size); + Zf (mount(NULL, WF_MNT, "tmpfs", 0, options), + "cannot mount tmpfs for overlay"); + Z_ (mkdir(WF_MNT "/upper", 0700)); + Z_ (mkdir(WF_MNT "/work", 0700)); + Z_ (mkdir(WF_MNT "/merged", 0700)); + mkdir_scratch = WF_MNT "/mkdir_overmount"; + Z_ (mkdir(mkdir_scratch, 0700)); + options = ch_asprintf(("lowerdir=%s,upperdir=%s,workdir=%s," + "index=on,userxattr,volatile"), + c->newroot, WF_MNT "/upper", WF_MNT "/work"); + // update newroot + Zf (stat(c->newroot, &st), + "can't stat new root; overmounted by tmpfs for -W?: %s", c->newroot); + c->newroot = WF_MNT "/merged"; + Zf (mount(NULL, c->newroot, "overlay", 0, options), + "can't overlay: %s, %s", c->newroot, options); + VERBOSE("newroot updated: %s", c->newroot); + } + DEBUG("starting bind-mounts"); + // Bind-mount default files and directories. + bind_mounts(BINDS_DEFAULT, c->newroot, MS_RDONLY, NULL); + // /etc/passwd and /etc/group. + if (!c->private_passwd) + passwd_setup(c); + // Container /tmp. 
+ if (c->private_tmp) { + tmpfs_mount("/tmp", c->newroot, NULL); + } else { + bind_mount(host_tmp, "/tmp", BD_REQUIRED, c->newroot, 0, NULL); + } + // Bind-mount user’s home directory at /home/$USER if requested. + if (c->host_home) { + T_ (c->overlay_size != NULL); + bind_mount(c->host_home, cat("/home/", username), + BD_MAKE_DST, c->newroot, 0, mkdir_scratch); + } + // Bind-mount user-specified directories. + bind_mounts(c->binds, c->newroot, 0, mkdir_scratch); +} + /* Join a specific namespace. */ -void join_namespace(pid_t pid, const char *ns) +void namespace_join(pid_t pid, const char *ns) { char *path; int fd; - T_ (1 <= asprintf(&path, "/proc/%d/ns/%s", pid, ns)); + path = ch_asprintf("/proc/%d/ns/%s", pid, ns); fd = open(path, O_RDONLY); if (fd == -1) { if (errno == ENOENT) { @@ -543,197 +463,23 @@ void join_namespace(pid_t pid, const char *ns) } } -/* Join the existing namespaces created by the join winner. */ -void join_namespaces(pid_t pid) +/* Join the existing namespaces containing process pid, which could be the + join winner or another process. */ +void namespaces_join(pid_t pid) { VERBOSE("joining namespaces of pid %d", pid); - join_namespace(pid, "user"); - join_namespace(pid, "mnt"); -} - -/* Replace the current process with user command and arguments. */ -void run_user_command(char *argv[], const char *initial_dir) -{ - LOG_IDS; - - if (initial_dir != NULL) - Zf (chdir(initial_dir), "can't cd to %s", initial_dir); - - VERBOSE("executing: %s", argv_to_string(argv)); - - Zf (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0), "can't set no_new_privs"); - if (verbose < LL_INFO) - T_ (freopen("/dev/null", "w", stdout)); - if (verbose < LL_STDERR) - T_ (freopen("/dev/null", "w", stderr)); - execvp(argv[0], argv); // only returns if error - Tf (0, "can't execve(2): %s", argv[0]); -} - -/* Set up the fake-syscall seccomp(2) filter. This computes and installs a - long-ish but fairly simple BPF program to implement the filter. 
To - understand this rather hairy language: - - 1. https://man7.org/training/download/secisol_seccomp_slides.pdf - 2. https://www.kernel.org/doc/html/latest/userspace-api/seccomp_filter.html - 3. https://elixir.bootlin.com/linux/latest/source/samples/seccomp */ -#ifdef HAVE_SECCOMP -void seccomp_install(void) -{ - int arch_ct = sizeof(SECCOMP_ARCHS)/sizeof(SECCOMP_ARCHS[0]) - 1; - int syscall_cts[arch_ct]; - struct sock_fprog p = { 0 }; - int ii, idx_allow, idx_fake, idx_mknod, idx_mknodat, idx_next_arch; - // Lengths of certain instruction groups. These are all obtained manually - // by counting below, violating DRY. We could automate these counts, but it - // seemed like the cost of extra buffers and code to do that would exceed - // that of maintaining the manual counts. - int ct_jump_start = 4; // ld arch & syscall nr, arch test, end-of-arch jump - int ct_mknod_jump = 2; // jump table handling for mknod(2) and mknodat(2) - int ct_mknod = 2; // mknod(2) handling - int ct_mknodat = 6; // mknodat(2) handling - - // Count how many syscalls we are going to fake in the standard way. We - // need this to compute the right offsets for all the jumps. - for (int ai = 0; SECCOMP_ARCHS[ai] != NR_END; ai++) { - p.len += ct_jump_start + ct_mknod_jump; - syscall_cts[ai] = 0; - for (int si = 0; FAKE_SYSCALL_NRS[si][0] != NR_END; si++) { - bool syscall_p = FAKE_SYSCALL_NRS[si][ai] != NR_NON; - syscall_cts[ai] += syscall_p; - p.len += syscall_p; // syscall jump table entry - } - DEBUG("seccomp: arch %x: found %d syscalls", - SECCOMP_ARCHS[ai], syscall_cts[ai]); - } - - // Initialize program buffer. - p.len += ( 1 // return allow - + 1 // return fake success - + ct_mknod // mknod(2) handling - + ct_mknodat); // mknodat(2) handling - DEBUG("seccomp(2) program has %d instructions", p.len); - T_ (p.filter = calloc(p.len, sizeof(struct sock_filter))); - - // Return call addresses. Allow needs to come first because we’ll jump to - // it for unknown architectures. 
- idx_allow = p.len - 2 - ct_mknod - ct_mknodat; - idx_fake = p.len - 1 - ct_mknod - ct_mknodat; - idx_mknod = p.len - ct_mknod - ct_mknodat; - idx_mknodat = p.len - ct_mknodat; - - // Build a jump table for each architecture. The gist is: if architecture - // matches, fall through into the jump table, otherwise jump to the next - // architecture (or ALLOW for the last architecture). - ii = 0; - idx_next_arch = -1; // avoid warning on some compilers - for (int ai = 0; SECCOMP_ARCHS[ai] != NR_END; ai++) { - int jump; - idx_next_arch = ii + syscall_cts[ai] + ct_jump_start + ct_mknod_jump; - // load arch into accumulator - iw(&p, ii++, BPF_LD|BPF_W|BPF_ABS, - offsetof(struct seccomp_data, arch), 0, 0); - // jump to next arch if arch doesn't match - jump = idx_next_arch - ii - 1; - T_ (jump <= 255); - iw(&p, ii++, BPF_JMP|BPF_JEQ|BPF_K, SECCOMP_ARCHS[ai], 0, jump); - // load syscall number into accumulator - iw(&p, ii++, BPF_LD|BPF_W|BPF_ABS, - offsetof(struct seccomp_data, nr), 0, 0); - // jump table of syscalls - for (int si = 0; FAKE_SYSCALL_NRS[si][0] != NR_END; si++) { - int nr = FAKE_SYSCALL_NRS[si][ai]; - if (nr != NR_NON) { - jump = idx_fake - ii - 1; - T_ (jump <= 255); - iw(&p, ii++, BPF_JMP|BPF_JEQ|BPF_K, nr, jump, 0); - } - } - // jump to mknod(2) handling (add even if syscall not implemented to - // make the instruction counts simpler) - jump = idx_mknod - ii - 1; - T_ (jump <= 255); - iw(&p, ii++, BPF_JMP|BPF_JEQ|BPF_K, FAKE_MKNOD_NRS[ai], jump, 0); - // jump to mknodat(2) handling - jump = idx_mknodat - ii - 1; - T_ (jump <= 255); - iw(&p, ii++, BPF_JMP|BPF_JEQ|BPF_K, FAKE_MKNODAT_NRS[ai], jump, 0); - // unfiltered syscall, jump to allow (limit of 255 doesn’t apply to JA) - jump = idx_allow - ii - 1; - iw(&p, ii++, BPF_JMP|BPF_JA, jump, 0, 0); - } - T_ (idx_next_arch == idx_allow); - - // Returns. (Note that if we wanted a non-zero errno, we’d bitwise-or with - // SECCOMP_RET_ERRNO. 
But because fake success is errno == 0, we don’t need - // a no-op “| 0”.) - T_ (ii == idx_allow); - iw(&p, ii++, BPF_RET|BPF_K, SECCOMP_RET_ALLOW, 0, 0); - T_ (ii == idx_fake); - iw(&p, ii++, BPF_RET|BPF_K, SECCOMP_RET_ERRNO, 0, 0); - - // mknod(2) handling. This just loads the file mode and jumps to the right - // place in the mknodat(2) handling. - T_ (ii == idx_mknod); - // load mode argument into accumulator - iw(&p, ii++, BPF_LD|BPF_W|BPF_ABS, - offsetof(struct seccomp_data, args[1]), 0, 0); - // jump to mode test - iw(&p, ii++, BPF_JMP|BPF_JA, 1, 0, 0); - - // mknodat(2) handling. - T_ (ii == idx_mknodat); - // load mode argument into accumulator - iw(&p, ii++, BPF_LD|BPF_W|BPF_ABS, - offsetof(struct seccomp_data, args[2]), 0, 0); - // jump to fake return if trying to create a device. - iw(&p, ii++, BPF_ALU|BPF_AND|BPF_K, S_IFMT, 0, 0); // file type only - iw(&p, ii++, BPF_JMP|BPF_JEQ|BPF_K, S_IFCHR, 2, 0); - iw(&p, ii++, BPF_JMP|BPF_JEQ|BPF_K, S_IFBLK, 1, 0); - // returns - iw(&p, ii++, BPF_RET|BPF_K, SECCOMP_RET_ALLOW, 0, 0); - iw(&p, ii++, BPF_RET|BPF_K, SECCOMP_RET_ERRNO, 0, 0); - - // Install filter. Use prctl(2) rather than seccomp(2) for slightly greater - // compatibility (Linux 3.5 rather than 3.17) and because there is a glibc - // wrapper. - T_ (ii == p.len); // next instruction now one past the end of the buffer - Z_ (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &p)); - DEBUG("note: see FAQ to disassemble the above") - - // Test filter. This will fail if the kernel executes the call (because we - // are not really privileged and the arguments are bogus) or succeed if - // filter handles it. We selected it over something more naturally in the - // filter, e.g. setuid(2), because (1) no container process should ever use - // it and (2) it’s unlikely to be emulated by a smarter filter in the - // future, i.e., it won’t silently start doing something. 
- Zf (syscall(SYS_kexec_load, 0, 0, NULL, 0), - "seccomp root emulation failed (is your architecture supported?)"); -} -#endif - -/* Wait for semaphore sem for up to timeout seconds. If timeout or an error, - exit unsuccessfully. */ -void sem_timedwait_relative(sem_t *sem, int timeout) -{ - struct timespec deadline; - - // sem_timedwait() requires a deadline rather than a timeout. - Z_ (clock_gettime(CLOCK_REALTIME, &deadline)); - deadline.tv_sec += timeout; - - if (sem_timedwait(sem, &deadline)) { - Ze (errno == ETIMEDOUT, "timeout waiting for join lock"); - Tf (0, "failure waiting for join lock"); - } + namespace_join(pid, "user"); + namespace_join(pid, "mnt"); } /* Activate the desired isolation namespaces. */ -void setup_namespaces(const struct container *c, uid_t uid_out, uid_t uid_in, +void namespaces_setup(const struct container *c, uid_t uid_out, uid_t uid_in, gid_t gid_out, gid_t gid_in) { int fd; + VERBOSE("setting up namespaces: %d:%d -> %d:%d", + uid_out, gid_out, uid_in, gid_in); LOG_IDS; Zf (unshare(CLONE_NEWNS|CLONE_NEWUSER), "can't init user+mount namespaces"); LOG_IDS; @@ -776,7 +522,7 @@ void setup_namespaces(const struct container *c, uid_t uid_out, uid_t uid_in, see issue #212. After bind-mounting, we remove the files from the host; they persist inside the container and then disappear completely when the container exits. 
*/ -void setup_passwd(const struct container *c) +void passwd_setup(const struct container *c) { int fd; char *path; @@ -784,7 +530,7 @@ void setup_passwd(const struct container *c) struct passwd *p; // /etc/passwd - T_ (path = cat(host_tmp, "/ch-run_passwd.XXXXXX")); + path = cat(host_tmp, "/ch-run_passwd.XXXXXX"); T_ (-1 != (fd = mkstemp(path))); // mkstemp(3) writes path if (c->container_uid != 0) T_ (1 <= dprintf(fd, "root:x:0:0:root:/root:/bin/sh\n")); @@ -809,7 +555,7 @@ void setup_passwd(const struct container *c) Z_ (unlink(path)); // /etc/group - T_ (path = cat(host_tmp, "/ch-run_group.XXXXXX")); + path = cat(host_tmp, "/ch-run_group.XXXXXX"); T_ (-1 != (fd = mkstemp(path))); if (c->container_gid != 0) T_ (1 <= dprintf(fd, "root:x:0:\n")); @@ -832,6 +578,66 @@ void setup_passwd(const struct container *c) Z_ (unlink(path)); } +/* Pivot into the container. Note that pivot_root(2) requires a complex dance + to work, i.e., to avoid multiple undocumented error conditions. This dance + is explained in detail in bin/ch-checkns.c. */ +void pivot(struct container *c) +{ + char *nr_parent, *nr_base; + + VERBOSE("pivoting into container"); + path_split(c->newroot, &nr_parent, &nr_base); + + // Overmount / to avoid EINVAL if it’s a rootfs. + Z_ (chdir(nr_parent)); + Z_ (mount(nr_parent, "/", NULL, MS_MOVE, NULL)); + Z_ (chroot(".")); + // Pivot into the new root. Use /dev because it’s available even in + // extremely minimal images. + c->newroot = cat("/", nr_base); + Zf (chdir(c->newroot), "can't chdir into new root"); + Zf (syscall(SYS_pivot_root, c->newroot, path_join(c->newroot, "dev")), + "can't pivot_root(2)"); + Zf (chroot("."), "can't chroot(2) into new root"); + Zf (umount2("/dev", MNT_DETACH), "can't umount old root"); +} + +/* Replace the current process with user command and arguments. 
*/ +void run_user_command(char *argv[], const char *initial_dir) +{ + LOG_IDS; + + if (initial_dir != NULL) + Zf (chdir(initial_dir), "can't cd to %s", initial_dir); + + VERBOSE("executing: %s", argv_to_string(argv)); + + Zf (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0), "can't set no_new_privs"); + if (verbose < LL_INFO) + T_ (freopen("/dev/null", "w", stdout)); + if (verbose < LL_STDERR) + T_ (freopen("/dev/null", "w", stderr)); + ch_memory_log("usrx"); + execvp(argv[0], argv); // only returns if error + Tf (0, "can't execve(2): %s", argv[0]); +} + +/* Wait for semaphore sem for up to timeout seconds. If timeout or an error, + exit unsuccessfully. */ +void sem_timedwait_relative(sem_t *sem, int timeout) +{ + struct timespec deadline; + + // sem_timedwait() requires a deadline rather than a timeout. + Z_ (clock_gettime(CLOCK_REALTIME, &deadline)); + deadline.tv_sec += timeout; + + if (sem_timedwait(sem, &deadline)) { + Ze (errno == ETIMEDOUT, "timeout waiting for join lock"); + Tf (0, "failure waiting for join lock"); + } +} + /* Mount a tmpfs at the given path. */ void tmpfs_mount(const char *dst, const char *newroot, const char *data) { diff --git a/bin/ch_core.h b/bin/core.h similarity index 67% rename from bin/ch_core.h rename to bin/core.h index f65cfc083..8615629fc 100644 --- a/bin/ch_core.h +++ b/bin/core.h @@ -1,9 +1,12 @@ /* Copyright © Triad National Security, LLC, and others. - This interface contains Charliecloud's core containerization features. */ + This interface contains Charliecloud’s core containerization features. 
*/ #define _GNU_SOURCE +#pragma once + #include +#include /** Types **/ @@ -20,6 +23,20 @@ struct bind { enum bind_dep dep; }; +struct container; // forward declaration to avoid definition loop +typedef void (hookf_t)(struct container *, void *); +struct hook { + const char *name; + hookf_t *f; + void *data; +}; + +enum hook_dup { // see hook_add() + HOOK_DUP_OK, + HOOK_DUP_SKIP, + HOOK_DUP_FAIL +}; + enum img_type { IMG_DIRECTORY, // normal directory, perhaps an external mount of some kind IMG_SQUASH, // SquashFS archive file (not yet mounted) @@ -32,16 +49,18 @@ struct container { gid_t container_gid; // GID to use in container uid_t container_uid; // UID to use in container bool env_expand; // expand variables in --set-env + struct hook *hooks_prestart; // prestart hook functions and their arguments char *host_home; // if --home, host path to user homedir, else NULL char *img_ref; // image description from command line + char **ldconfigs; // directories to pass to image’s ldconfig(8) char *newroot; // path to new root directory bool join; // is this a synchronized join? int join_ct; // number of peers in a synchronized join pid_t join_pid; // process in existing namespace to join char *join_tag; // identifier for synchronized join char *overlay_size; // size of overlaid tmpfs (NULL for no overlay) - bool private_passwd; // don't bind custom /etc/{passwd,group} - bool private_tmp; // don't bind host's /tmp + bool private_passwd; // don’t bind custom /etc/{passwd,group} + bool private_tmp; // don’t bind host's /tmp enum img_type type; // directory, SquashFS, etc. 
bool writable; // re-mount image read-write }; @@ -50,9 +69,9 @@ struct container { /** Function prototypes **/ void containerize(struct container *c); +void hook_add(struct hook **hook_list, enum hook_dup dup, + const char *name, hookf_t *f, void *d); +void hooks_run(struct container *c, struct hook **hook_list); enum img_type image_type(const char *ref, const char *images_dir); char *img_name2path(const char *name, const char *storage_dir); void run_user_command(char *argv[], const char *initial_dir); -#ifdef HAVE_SECCOMP -void seccomp_install(void); -#endif diff --git a/bin/ch_fuse.c b/bin/fuse.c similarity index 97% rename from bin/ch_fuse.c rename to bin/fuse.c index ce60bbcc7..a6b0bc1da 100644 --- a/bin/ch_fuse.c +++ b/bin/fuse.c @@ -35,10 +35,11 @@ // Now we can include ll.h. #include -#include "config.h" -#include "ch_core.h" -#include "ch_fuse.h" -#include "ch_misc.h" +#include "config.h" // here to avoid potential clash with SquashFUSE config.h +#include "core.h" +#include "fuse.h" +#include "mem.h" +#include "misc.h" /** Types **/ @@ -121,8 +122,7 @@ void sq_fork(struct container *c) // Default mount point? if (c->newroot == NULL) { - char *subdir; - T_ (asprintf(&subdir, "/%s.ch/mnt", username) > 0); + char *subdir = ch_asprintf("/%s.ch/mnt", username); c->newroot = cat("/var/tmp", subdir); VERBOSE("using default mount point: %s", c->newroot); mkdirs("/var/tmp", subdir, NULL, NULL); @@ -141,8 +141,7 @@ void sq_fork(struct container *c) // Now that the filesystem is mounted, we can fork without race condition. // The child returns to caller and runs the user command. When that exits, // the parent gets SIGCHLD. 
- pid_child = fork(); - Tf (pid_child >= 0, "can't fork"); + pid_child = ch_fork(); if (pid_child > 0) // parent (child does nothing here) exit(sq_loop()); } @@ -204,7 +203,7 @@ int sq_loop(void) // [1]: https://codereview.stackexchange.com/a/109349 // [2]: https://man7.org/linux/man-pages/man2/wait.2.html exit_code = 1; - VERBOSE("child terminated by signal %d", WTERMSIG(child_status)) + VERBOSE("child terminated by signal %d", WTERMSIG(child_status)); } } @@ -229,7 +228,7 @@ void sq_mount(const char *img_path, char *mountpt) struct fuse_args mount_args = FUSE_ARGS_INIT(mount_argc, mount_argv); sq.mountpt = mountpt; - T_ (sq.chan = malloc(sizeof(sqfs_ll_chan))); + sq.chan = ch_malloc(sizeof(sqfs_ll_chan), true); sq.ll = sqfs_ll_open(img_path, 0); Te (sq.ll != NULL, "can't open SquashFS: %s; try ch-run -vv?", img_path); diff --git a/bin/ch_fuse.h b/bin/fuse.h similarity index 91% rename from bin/ch_fuse.h rename to bin/fuse.h index 5250ed85a..bc756c54d 100644 --- a/bin/ch_fuse.h +++ b/bin/fuse.h @@ -1,6 +1,7 @@ /* Copyright © Triad National Security, LLC, and others. */ #define _GNU_SOURCE +#pragma once /** Function prototypes **/ diff --git a/bin/hook.c b/bin/hook.c new file mode 100644 index 000000000..7a79bcab9 --- /dev/null +++ b/bin/hook.c @@ -0,0 +1,41 @@ +/* Copyright © Triad National Security, LLC, and others. */ + +#define _GNU_SOURCE +#include "config.h" + +#include + +#include "core.h" +#include "hook.h" +#include "misc.h" + + +/** Function prototypes (private) **/ + + +/** Functions **/ + +/* Set the environment variables listed in d. */ +void hook_envs_set(struct container *c, void *d) +{ + struct env_var *vars = d; + envs_set(vars, c->env_expand); +} + +/* Set the environment variables specified in file d. */ +void hook_envs_set_file(struct container *c, void *d) +{ + struct env_file *ef = d; + envs_set(env_file_read(ef->path, ef->delim), c->env_expand); +} + +/* Unset the environment variables matching glob d. 
*/ +void hook_envs_unset(struct container *c, void *d) +{ + envs_unset((char *)d); +} + + +void hook_ldconfig(struct container *c, void *d) +{ +} diff --git a/bin/hook.h b/bin/hook.h new file mode 100644 index 000000000..ce4426d9f --- /dev/null +++ b/bin/hook.h @@ -0,0 +1,25 @@ +/* Copyright © Triad National Security, LLC, and others. + + This interface contains hooks that don’t deserve their own file. */ + +#define _GNU_SOURCE +#pragma once + +#include "core.h" +#include "misc.h" + + +/** Types **/ + +struct env_file { + char *path; + char delim; + bool expand; +}; + + +/** Function prototypes **/ + +void hook_envs_set_file(struct container *c, void *d); +void hook_envs_set(struct container *c, void *d); +void hook_envs_unset(struct container *c, void *d); diff --git a/bin/json.c b/bin/json.c new file mode 100644 index 000000000..adf6d5182 --- /dev/null +++ b/bin/json.c @@ -0,0 +1,493 @@ +/* Copyright © Triad National Security, LLC, and others. */ + +#define _GNU_SOURCE +#include "config.h" + +#include +#include +#include +#include +#include +#include + +#include CJSON_H + +#include "core.h" +#include "json.h" +#include "mem.h" +#include "misc.h" + + +/** Macros **/ + + +/** Types **/ + +/* Dispatch table row for CDI hook emulation. + + We could alternately put args last, making it a “flexible array member”. + That would make the field order slightly sub-optimal, but more importantly + it would make sizeof() return misleading results, which seems like a + nasty trap waiting for someone. */ +#define HOOK_ARG_MAX 3 +struct cdi_hook_dispatch { + size_t arg_ct; // number of arguments to compare + char *args[HOOK_ARG_MAX]; // matching arguments + void (*f)(void *, char **args); // NULL to ignore quietly +}; +#define HDF void (*)(void *, char **args) // to cast in dispatch tables + +struct cdi_spec { + char *kind; + char *src_path; // source spec file path + dev_t src_dev; // ... device ID + ino_t src_ino; // ... 
inode number + struct env_var *envs; + struct bind *binds; + char **ldconfigs; // directories to process with ldconfig(8) +}; + +struct json_dispatch { + char *name; + struct json_dispatch *children; + void (*f)(cJSON *tree, void *state); +}; +#define JDF void (*)(cJSON *, void *) // to cast callbacks in dispatch tables + + +/** Constants **/ + +// Block size in bytes for reading JSON files. +const size_t READ_SZ = 16384; + + +/** Function prototypes (private) **/ + +char **array_strings_json_to_c(cJSON *jarry, size_t *ct); +void cdi_append(struct cdi_spec **specs, struct cdi_spec *spec); +void cdi_hook_nv_ldcache(struct cdi_spec *spec, char **args); +char *cdi_hook_to_string(const char *hook_name, char **args); +void cdi_log(struct cdi_spec *spec); +struct cdi_spec *cdi_read(const char *path); +struct cdi_spec *cdi_read_maybe(struct cdi_spec *specs, const char *path); +bool cdi_requested(struct cdi_config *cf, struct cdi_spec *spec); +void visit(struct json_dispatch actions[], cJSON *tree, void *state); +void visit_dispatch(struct json_dispatch action, cJSON *tree, void *state); + +// parser callbacks +void cdiPC_cdiVersion(cJSON *tree, struct cdi_spec *spec); +void cdiPC_env(cJSON *tree, struct cdi_spec *spec); +void cdiPC_hook(cJSON *tree, struct cdi_spec *spec); +void cdiPC_kind(cJSON *tree, struct cdi_spec *spec); + + +/** Globals **/ + +/* List of CDI specs we’ve read. Yes it’s a global, but that lets us keep + struct cdi_spec private to this file, which seemed like the right + trade-off. It also seemed like “all the specs we know about” wasn’t + something we needed multiple of. */ +struct cdi_spec *cdi_specs = NULL; + +/* Callback tables. In the struct, the callback’s second argument is “void *” + so any state object can be provided. However, we’d prefer the actual + functions to take the correct pointer type; thus, they need to be cast. + Alternatives include: + + 1. Cast every use of the variable in the callbacks. This seemed verbose + and error-prone. 
+ + 2. Add a local variable of the correct type to each callback. I thought + such distributed boilerplate seemed worse. */ +struct json_dispatch cdiPD_containerEdits[] = { + { "env", NULL, (JDF)cdiPC_env }, + { "hooks", NULL, (JDF)cdiPC_hook }, + { } +}; +struct json_dispatch cdiPD_root[] = { + { "cdiVersion", NULL, (JDF)cdiPC_cdiVersion }, + { "kind", NULL, (JDF)cdiPC_kind }, + { "containerEdits", cdiPD_containerEdits, }, + { } +}; + +/* CDI hook dispatch table. */ +struct cdi_hook_dispatch cdi_hooks[] = { + { 2, { "nvidia-ctk-hook", "update-ldcache" }, (HDF)cdi_hook_nv_ldcache }, + { 3, { "nvidia-ctk", "hook", "update-ldcache" }, (HDF)cdi_hook_nv_ldcache }, + { 2, { "nvidia-ctk-hook", "chmod" }, NULL }, + { 3, { "nvidia-ctk", "hook", "chmod" }, NULL }, + { 2, { "nvidia-ctk-hook", "create-symlinks" }, NULL }, + { 3, { "nvidia-ctk", "hook", "create-symlinks" }, NULL }, + { } +}; + + +/** Functions **/ + + +/* Given JSON array of strings jar, which may be of length zero, convert it to + a freshly allocated NULL-terminated array of C strings (pointers to + null-terminated chars buffers) and return that. ct is an out parameter + + WARNING: This is a shallow copy, i.e., the actual strings are still shared + with the JSON array. */ +char **array_strings_json_to_c(cJSON *jarry, size_t *ct) +{ + size_t i; + char **carry; + cJSON *j; + + Tf (cJSON_IsArray(jarry), "JSON: expected array"); + *ct = cJSON_GetArraySize(jarry); + carry = ch_malloc((*ct + 1) * sizeof(char *), true); + carry[*ct] = NULL; + + i = 0; + cJSON_ArrayForEach(j, jarry) { + Tf (cJSON_IsString(j), "JSON: expected string"); + carry[i++] = j->valuestring; + } + + return carry; +} + +/* Return true if devid is a device kind (e.g. “nvidia.com/gpu”), false if + it’s a path. Exit with error if NULL pointer or empty string. */ +bool cdi_devid_kind_p(const char *devid) +{ + T_ (devid != NULL && devid[0] != '\0'); + return (devid[0] != '.' 
&& devid[0] != '/'); +} + +/* Return a list of environment variables to be set for device devid, which + can be either a device kind or a path, or if devid is NULL, all known + devices. */ +struct env_var *cdi_envs_get(const char *devid) +{ + struct env_var *vars = list_new(sizeof(struct env_var), 0); + + for (int i = 0; cdi_specs[i].kind != NULL; i++) { + // Compare devid with both kind and path without checking what it is + // because it seemed the odds of false positive low enough. + if ( devid == NULL + || !strcmp(devid, cdi_specs[i].kind) + || !strcmp(devid, cdi_specs[i].src_path)) + list_append((void **)&vars, cdi_specs[i].envs, sizeof(vars[0])); + } + + return vars; +} + +void cdi_hook_nv_ldcache(struct cdi_spec *spec, char **args) +{ + for (size_t i = 0; args[i] != NULL; i++) + if (!strcmp("--folder", args[i])) { + char *dir; + T_ (args[i+1] != NULL); + T_ (dir = strdup(args[i+1])); + // FIXME: YOU ARE HERE: APPEND ONLY IF WE DON'T ALREADY HAVE DIR + list_append((void **)&spec->ldconfigs, &dir, sizeof(dir)); + i++; + } +} + +/* Return a freshly allocated string describing the given hook, for logging. */ +char *cdi_hook_to_string(const char *hook_name, char **args) +{ + char *args_str; + + args_str = ""; + for (size_t i = 0; args[i] != NULL; i++) + args_str = cats(3, args_str, " ", args[i]); + + return ch_asprintf("%s:%s", hook_name, args_str); +} + +/* Read the CDI spec files we need. + + Note: We only read spec files in the search path directories if either + (a) --devices is specified, requesting all known devices or (b) a device + kind (rather than a filename) is given to --device (e.g., “nvidia.com/gpu”. + This protects users from errors in the spec files if they have not + requested any CDI features. */ +void cdi_init(struct cdi_config *cf) +{ + bool req_by_kind = false; + + // Initialize specs list. + T_ (cdi_specs == NULL); + cdi_specs = list_new(sizeof(struct cdi_spec), 0); + + // Read CDI spec files specifically requested. 
+ for (int i = 0; cf->devids[i] != NULL; i++) + if (cdi_devid_kind_p(cf->devids[i])) + req_by_kind = true; + else { + struct cdi_spec *spec = cdi_read_maybe(cdi_specs, cf->devids[i]); + if (spec != NULL) + list_append((void **)&cdi_specs, spec, sizeof(*spec)); + } + + // Read CDI spec files in configured directories if necessary. + if (cf->devs_all_p || req_by_kind) + for (int i = 0; cf->spec_dirs[i] != NULL; i++) { + char **entries = dir_glob(cf->spec_dirs[i], "*.json"); + for (int j = 0; entries[j] != NULL; j++) { + struct cdi_spec *spec; + spec = cdi_read_maybe(cdi_specs, + path_join(cf->spec_dirs[i], entries[j])); + if (spec != NULL && cdi_requested(cf, spec)) + list_append((void **)&cdi_specs, spec, sizeof(*spec)); + } + } + + // debugging: print parsed CDI specs + DEBUG("CDI: read %d specs", list_count(cdi_specs, sizeof(cdi_specs[0]))); + for (size_t i = 0; cdi_specs[i].kind != NULL; i++) + cdi_log(&cdi_specs[i]); + +/* + // update c + for (size_t i = 0; specs[i] != NULL; i++) { + // ldconfigs; copy rather than assigning because (1) easier to free + // and (2) still works if we later grow other sources of ldconfig. + list_cat((void **)&c->ldconfigs, (void *)specs[i]->ldconfigs, + sizeof(c->ldconfigs[0])); + } +*/ +} + +/* Log contents of spec.
*/ +void cdi_log(struct cdi_spec *spec) +{ + size_t ct; + + DEBUG("CDI: %s from %s (%u,%u %lu):", spec->kind, spec->src_path, + major(spec->src_dev), minor(spec->src_dev), (unsigned long)spec->src_ino); + ct = list_count((void *)(spec->envs), sizeof(struct env_var)); + DEBUG("CDI: environment: %zu:", ct); + for (size_t i = 0; i < ct; i++) + DEBUG("CDI: %s=%s", spec->envs[i].name, spec->envs[i].value); + ct = list_count((void *)(spec->binds), sizeof(struct bind)); + DEBUG("CDI: bind mounts: %zu:", ct); + for (size_t i = 0; i < ct; i++) + DEBUG("CDI: %s -> %s", spec->binds[i].src, spec->binds[i].dst); + ct = list_count((void *)(spec->ldconfigs), sizeof(char *)); + DEBUG("CDI: ldconfig directories: %zu:", ct); + for (size_t i = 0; i < ct; i++) + DEBUG("CDI: %s", spec->ldconfigs[i]); +} + +/* Read and parse the CDI spec file at path. Return a pointer to the parsed + struct. If something goes wrong, exit with error. */ +struct cdi_spec *cdi_read(const char *path) +{ + FILE *fp; + struct stat st; + char *text = NULL; + const char *parse_end; + cJSON *tree; + struct cdi_spec *spec = NULL; + + // Read file into string. Allocate incrementally rather than seeking so + // non-seekable input works. + Tf (fp = fopen(path, "rb"), "CDI: can't open: %s", path); + Zf (fstat(fileno(fp), &st), "CDI: can't stat: %s", path); + for (size_t used = 0, avail = READ_SZ; true; avail += READ_SZ) { + size_t read_ct; + text = ch_realloc(text, avail, false); + read_ct = fread(text + used, 1, READ_SZ, fp); + used += read_ct; + if (read_ct < READ_SZ) { + if (feof(fp)) { // EOF reached + T_ (used < avail); + text[used] = '\0'; // terminate string + break; + } + Tf(0, "CDI: can't read: %s", path); + } + } + + // Parse JSON. + tree = cJSON_ParseWithOpts(text, &parse_end, false); + Tf(tree != NULL, "CDI: JSON failed at byte %td: %s", parse_end - text, path); + + // Visit parse tree to build our struct.
+ spec = ch_malloc(sizeof(struct cdi_spec), true); + spec->src_path = (char *)path; // shouldn’t ever be written + spec->src_dev = st.st_dev; + spec->src_ino = st.st_ino; + visit(cdiPD_root, tree, spec); + + // Clean up. + VERBOSE("CDI: spec read OK: %s: %s", spec->kind, path); + return spec; +} + +/* Read and parse the CDI spec file at path, returning a pointer to the + newly-allocated spec struct, unless (1) we already read the file, in which + case log that fact and return NULL, or (2) the device kind has already been + specified, in which case exit with error. If something else goes wrong, + also exit with error. */ +struct cdi_spec *cdi_read_maybe(struct cdi_spec *specs, const char *path) +{ + struct cdi_spec *spec; + struct stat st; + + // Don’t read file if we already did. It’s relatively easy to give a spec + // file more than once, e.g. if it’s in the search path and also an + // argument to --device. + for (int i = 0; specs[i].kind != NULL; i++) { + Zf (stat(path, &st), "can’t stat CDI spec: %s", path); + if (st.st_dev == specs[i].src_dev && st.st_ino == specs[i].src_ino) { + VERBOSE("CDI: spec already read, skipping: %s", path); + return NULL; + } + } + + spec = cdi_read(path); + + // Error if this device already specified, which because we don’t re-read + // files means two files specified the same device kind. + for (int i = 0; specs[i].kind != NULL; i++) + Te (strcmp(spec->kind, specs[i].kind), + "CDI: device found in multiple spec files: %s: %s and %s", + spec->kind, specs[i].src_path, spec->src_path); + + return spec; +} + +/* Return true if the given spec was requested by configuration cf, false + otherwise. 
*/ +bool cdi_requested(struct cdi_config *cf, struct cdi_spec *spec) +{ + if (cf->devs_all_p) + return true; + + for (int i; cf->devids[i] != NULL; i++) + if ( cdi_devid_kind_p(cf->devids[i]) + && !strcmp(cf->devids[i], spec->kind)) + return true; + + return false; +} + +void cdiPC_cdiVersion(cJSON *tree, struct cdi_spec *spec) +{ + DEBUG("CDI: %s: version %s", spec->src_path, tree->valuestring); +} + +void cdiPC_env(cJSON *tree, struct cdi_spec *spec) +{ + struct env_var ev; + size_t name_len, value_len; // not including null terminator + char *delim, *arnold; + + T_ (cJSON_IsString(tree)); + T_ (delim = strchr(tree->valuestring, '=')); + T_ (arnold = strchr(tree->valuestring, 0)); + + name_len = delim - tree->valuestring; + value_len = arnold - delim - 1; + T_ (ev.name = malloc(name_len + 1)); + memcpy(ev.name, tree->valuestring, name_len); + ev.name[name_len] = 0; + T_ (ev.value = malloc(value_len + 1)); + memcpy(ev.value, delim + 1, value_len); + ev.value[value_len] = 0; + + list_append((void **)&spec->envs, &ev, sizeof(ev)); +} + +void cdiPC_hook(cJSON *tree, struct cdi_spec *spec) +{ + char **args; + size_t arg_ct; + char *hook_name; + char *hook_str; + bool hook_known; + //struct cdi_hook_dispatch hook; + + T_ (hook_name = cJSON_GetStringValue(cJSON_GetObjectItem(tree, "hookName"))); + + T_ (cJSON_IsArray(cJSON_GetObjectItem(tree, "args"))); + args = array_strings_json_to_c(cJSON_GetObjectItem(tree, "args"), &arg_ct); + hook_str = cdi_hook_to_string(hook_name, args); + + hook_known = false; + for (size_t i = 0; cdi_hooks[i].arg_ct != 0; i++) { // for each table row + if (arg_ct >= cdi_hooks[i].arg_ct) { // enough hook args to compare + for (size_t j = 0; j < cdi_hooks[i].arg_ct; j++) + if (strcmp(args[j], cdi_hooks[i].args[j])) + goto continue_outer; + hook_known = true; // all words matched + if (cdi_hooks[i].f == NULL) { + DEBUG("CDI: ignoring known hook: %s", hook_str); + } else { + DEBUG("CDI: emulating known hook: %s", hook_str); + cdi_hooks[i].f(spec, 
&args[cdi_hooks[i].arg_ct]); + } + break; // only call one hook function + } + continue_outer: + } + + if (!hook_known) + WARNING("CDI: ignoring unknown hook: %s", hook_str); +} + +void cdiPC_kind(cJSON *tree, struct cdi_spec *spec) +{ + T_ (spec->kind = strdup(tree->valuestring)); +} + +/* Initialize the cJSON stuff. Quirks: + + 1. Despite using reallocation internally, cJSON indeed does not accept a + realloc(3) replacement, though it possibly used to. If malloc(3) and + free(3) are provided, then it just doesn’t call any realloc(). + + Weirdly, cJSON appears to have a notion of “internal” memory management + that uses malloc(3), realloc(3), and free(3) regardless of these hooks. + + 2. cJSON prefixes everything with CJSON_CDECL, which is juts __cdecl, which + is unnecessary for C code. Maybe this is for using cJSON in C++? */ +void json_init(void) +{ + cJSON_Hooks hooks = (cJSON_Hooks) { + .malloc_fn = ch_malloc_pointerful, + .free_fn = ch_free_noop, + }; + + cJSON_InitHooks(&hooks); +} + +/* Visit each node in the parse tree in depth-first order. At each node, if + there is a matching callback in actions, call it. For arrays, call the + callback once per array element. */ +void visit(struct json_dispatch actions[], cJSON *tree, void *state) +{ + for (int i = 0; actions[i].name != NULL; i++) { + cJSON *subtree = cJSON_GetObjectItem(tree, actions[i].name); + if (subtree != NULL) { // child matching action name exists + if (!cJSON_IsArray(subtree)) + visit_dispatch(actions[i], subtree, state); + else { + cJSON *elem; + cJSON_ArrayForEach(elem, subtree) + visit_dispatch(actions[i], elem, state); + } + } + } +} + +/* Call the appropriate callback for the the root node of tree, if any. Then + visit its children, if any. 
*/ +void visit_dispatch(struct json_dispatch action, cJSON *tree, void *state) +{ + if (action.f != NULL) + action.f(tree, state); + if (action.children != NULL) + visit(action.children, tree, state); +} diff --git a/bin/json.h b/bin/json.h new file mode 100644 index 000000000..9675a7a3e --- /dev/null +++ b/bin/json.h @@ -0,0 +1,32 @@ +/* Copyright © Triad National Security, LLC, and others. + + This interface contains all functions that deal with JSON: OCI, CDI, and + friends. */ + +#define _GNU_SOURCE +#pragma once +#include "config.h" + +#include + +#include "core.h" +#include "misc.h" + +#include CJSON_H + + +/** Types **/ + +/* General CDI configuration. */ +struct cdi_config { + char **spec_dirs; // directories to search for CDI spec files + bool devs_all_p; // inject all devices found + char **devids; // user-requested devices +}; + + +/** Function prototypes **/ + +struct env_var *cdi_envs_get(const char *devid); +void cdi_init(struct cdi_config *cf); +void json_init(void); diff --git a/bin/mem.c b/bin/mem.c new file mode 100644 index 000000000..308358f2d --- /dev/null +++ b/bin/mem.c @@ -0,0 +1,460 @@ +/* libgc API + --------- + + See: + + https://hboehm.info/gc/gcinterface.html + https://github.com/ivmai/bdwgc/blob/57ccbcc/include/gc/gc.h#L459 + + The latter is more complete. + + libgc provides both upper-case, e.g. GC_MALLOC(), and lower-case, e.g. + GC_malloc(), versions of many functions. It’s not totally clear to me what + the separation principles are, though the vibe does seem to prefer the + upper-case versions. We use the upper-case when available. + + Zeroing newly-allocated memory + ------------------------------ + + Because we use a lot of zero-terminated data structures, it would be nice + for the allocation functions to just always return zeroed buffers. We also + want to not require libgc, i.e., we want to still be able to use malloc(3) + and realloc(3) under the hood. 
It’s easy to provide a zeroing + malloc(3)-workalike, and we do, but as far as I can tell, it’s impossible + to do so for realloc(3)-alike unless we either (1) maintain our own + allocation size tracking or (2) use highly non-portable code. Neither of + these seemed worth the effort and complexity. + + This is because, as it turns out, the length of an allocated buffer is a + more complicated notion than it seems. A buffer has *two* different + lengths: L1 is the size requested by the original caller, and L2 is the + size actually allocated; L2 ≥ L1. Neither are reliably available: + + * L1: The allocator can’t provide it, and while the caller had it at the + time of previous allocation, it might not have kept it. + + * L2: Not available from the libc allocator without fairly extreme + non-portability and/or difficult constraints [1], though libgc does + provide it with GC_size(). The caller never knew it. + + Suppose we call realloc() with a new length Lν, where Lν > L2 ≥ L1. To zero + the new part of the buffer, we must zero (L1,Lν], or (L2,Lν] if we assume + (L1,L2] are still zero from the initial malloc(), and leave prior bytes + untouched. But we don’t know either L1 or L2 reliably, so we’re hosed, + whether we call an upstream realloc() or malloc() an entirely new buffer, + then memcpy(3). + + I suspect this is why libc provides calloc(3) but not an equivalent for + realloc(3). + + [1]: https://stackoverflow.com/questions/1281686 */ + +#define _GNU_SOURCE +#include "config.h" + +#include +#include +#include +#include +#include + +#ifdef HAVE_GC +#include +#endif + +#include "mem.h" +#include "misc.h" + + +/** Macros **/ + +/** Types **/ + +/** Constants **/ + +/** Function prototytpes (private) **/ + +ssize_t kB(ssize_t byte_ct); + + +/** Globals **/ + +/* Note: All the memory statistics are signed “ssize_t” rather than the more + correct unsigned “size_t” so that subtractions are less error-prone (we + report lots of differences). 
We assume that memory usage is small enough + for this to not matter. */ + +/* Size of the stack, heap, and anonymous mmap(2) mappings at previous + ch_memory_log() call. */ +ssize_t stack_prev = 0; +ssize_t heap_prev = 0; +ssize_t anon_prev = 0; + +#ifdef HAVE_GC + +/* Note: The first four counters are from GC_prof_stats_s fields and have the + corresponding names. Total size of allocated blocks is derived. See gc.h. */ + +/* Total size of the heap. This includes “unmapped” bytes that libgc is + tracking but has given back to the OS, I assume to be re-requested from the + OS if needed. */ +ssize_t heapsize_prev = 0; + +/* Free bytes in the heap, both mapped and unmapped. */ +ssize_t free_prev = 0; + +/* Unmapped bytes (i.e., returned to the OS but still tracked by libgc) in the + heap. */ +ssize_t unmapped_prev = 0; + +/* Number of garbage collections done so far. */ +ssize_t gc_no_prev = 0; + +/* Total time spent doing garbage collection, in milliseconds. Corresponds to + GC_get_full_gc_total_time(). Note that because ch-run is single-threaded, + we do not report time spent collecting with the world stopped. */ +long time_collecting_prev = 0; + +#endif + + +/** Functions **/ + +/* Return a snprintf(3)-formatted string in a newly allocated buffer of + appropriate length. Exit on error. + + This function formats the string twice: Once to figure out how long the + formatted string is, and again to actually format the string. I’m not aware + of a better way to compute string length. (musl does it the same way; glibc + was too complicated for my patience in figuring it out.) + + An alternative would be to allocate a small buffer, try that, and if it’s + too small re-allocate and format again. For strings that fit, this would + save a formatting cycle at the cost of wasted memory and more code paths. + That didn’t seem like the right trade-off, esp. since short strings should + be the fastest to format. */ +char *ch_asprintf(const char *fmt, ...) 
+{ + va_list ap; + char *str; + + va_start(ap, fmt); + str = ch_vasprintf(fmt, ap); + va_end(ap); + + return str; +} + +/* Fork the process. In parent, return the PID of the child; in the child, + return 0. Cannot fail. + + The main purpose of this wrapper is to do an aggressive garbage collection + prior to fork(2) so the child is a small as possible. */ +pid_t ch_fork(void) +{ + pid_t child; + + ch_memory_log("fork"); + garbageinate("fkgc"); + + child = fork(); + Tf (child >= 0, "can't fork"); + + return child; +} + +/* free(3)-alike that does nothing. Don’t call it. Provided for libraries that + let us hook memory allocation and de-allocation, e.g. cJSON. */ +void ch_free_noop(void *p) +{ +} + +/* Return a new null-terminated string containing the next record from fp, + where records are delimited by delim (e.g., pass '\n' to get the next + line). If no more records available, return NULL. Exit on error. + + Unlike getdelim(3), the delimiter is *not* part of the returned string. + + Warnings: + + 1. Records cannot contain the zero byte, and behavior is undefined if fp + containes any zeros and delimiter is not '\0'. + + 2. The returned buffer is likely larger than needed. We assume wasting + this space is better than the overhead of realloc’ing down to a + precise size. 
*/ +char *ch_getdelim(FILE *fp, char delim) +{ + size_t bytes_read = 0; + size_t buf_len = 8; // non-zero start avoids early frequent realloc + char *buf = ch_malloc(buf_len, false); + + while (true) { + int c = fgetc(fp); + if (c == EOF) + break; + bytes_read++; + if (bytes_read > buf_len) { // room for terminator ensured later + buf_len *= 2; + buf = ch_realloc(buf, buf_len, false); + } + buf[bytes_read-1] = c; + if (c == delim) + break; + } + + if (buf[bytes_read-1] == delim) { // found delimiter + buf[bytes_read-1] = '\0'; + } else if (feof(fp)) { // end-of-file + if (bytes_read == 0) // no record left + return NULL; + else { // record ends at EOF (no delimiter) + if (bytes_read >= buf_len) { + T_ (bytes_read == buf_len); + buf = ch_realloc(buf, buf_len + 1, false); + } + buf[bytes_read] = '\n'; + } + } else { // error + Te (0, "error reading file"); // don’t know filename here + } + + return buf; +} + +/* Allocate and return a new buffer of length size bytes. The initial contents + of the buffer are undefined. + + If pointerful, then the buffer may contain pointers. Otherwise, the caller + guarantees no pointers will ever be stored in the buffer. This allows + garbage collection optimizations. If unsure, say true. */ +void *ch_malloc(size_t size, bool pointerful) +{ + void *buf; + +#ifdef HAVE_GC + buf = pointerful ? GC_MALLOC(size) : GC_MALLOC_ATOMIC(size); +#else + (void)pointerful; // suppress warning + buf = malloc(size); +#endif + + T_ (buf); + return buf; +} + +/* Like ch_malloc(), but same API as malloc(3). Prefer ch_malloc(). This is + provided for libraries that let us hook memory allocation and + de-allocation, e.g. cJSON. */ +void *ch_malloc_pointerful(size_t size) +{ + return ch_malloc(size, true); +} + +/* Like ch_malloc(), but buffer contents are zeroed. */ +void *ch_malloc_zeroed(size_t size, bool pointerful) +{ + void *buf = ch_malloc(size, pointerful); + memset(buf, 0, size); + return buf; +} + +/* Shut down memory management. 
*/ +void ch_memory_exit(void) +{ + ch_memory_log("exit"); +} + +/* Initialize memory management. We don’t log usage here because it’s called + before logging is up. */ +void ch_memory_init(void) +{ +#ifdef HAVE_GC + //GC_set_handle_fork(1); // I think the default mode is fine??? + GC_INIT(); + GC_start_performance_measurement(); +#endif +} + +/* Log stack and heap memory usage, and GC statistics if enabled, to stderr + and syslog if enabled. */ +void ch_memory_log(const char *when) +{ + FILE *fp; + char *line = NULL; + char *s; + ssize_t stack_len = 0, heap_len = 0, anon_len = 0; + ssize_t total_len, total_prev; +#ifdef HAVE_GC + struct GC_prof_stats_s ps; + ssize_t used, used_prev; + long time_collecting; +#endif + + /* Compute stack, heap, and anonymous mapping sizes. While awkward, AFAICT + this is the best available way to get these sizes. See proc_pid_maps(5). + Whitespace-separated (?) fields: + + 1. start (inclusive) and end (exclusive) addresses, in hex + 2. permissions, e.g. “r-xp” + 3. offset, in hex + 4. device major:minor, in hex? + 5. inode number, in decimal + 6. pathname */ + T_ (fp = fopen("/proc/self/maps", "r")); + while ((line = ch_getdelim(fp, '\n'))) { + int conv_ct; + void *start, *end; + char path[8] = { 0 }; // length must match format string! 
+ conv_ct = sscanf(line, "%p-%p %*[rwxp-] %*x %*x:%*x %*u %7s", + &start, &end, path); + if (conv_ct < 2) { // will be 2 if path empty + WARNING("please report this bug: can't parse map: %d: \"%s\"", + conv_ct, line); + break; + } + if (strlen(path) == 0) + anon_len += end - start; + else if (!strcmp(path, "[stack]")) + stack_len += end - start; + else if (!strcmp(path, "[heap]")) + heap_len += end - start; + } + Z_ (fclose(fp)); + + // log the basics + total_len = stack_len + heap_len + anon_len; + total_prev = stack_prev + heap_prev + anon_prev; + s = ch_asprintf("mem: %s: " + "%zdkB %+zd (stac %zdkB %+zd, heap %zdkB %+zd, anon %zdkB %+zd)", + when, + kB(total_len), kB(total_len - total_prev), + kB(stack_len), kB(stack_len - stack_prev), + kB(heap_len), kB(heap_len - heap_prev), + kB(anon_len), kB(anon_len - anon_prev)); + stack_prev = stack_len; + heap_prev = heap_len; + anon_prev = anon_len; + DEBUG(s); +#ifdef ENABLE_SYSLOG + syslog(SYSLOG_PRI, "%s", s); +#endif + + // log GC stuff +#ifdef HAVE_GC + GC_get_prof_stats(&ps, sizeof(ps)); + time_collecting = GC_get_full_gc_total_time(); + // space + used = ps.heapsize_full - ps.free_bytes_full; + used_prev = heapsize_prev - free_prev; + s = ch_asprintf("gc: %s: " + "%zdkB %+zd (used %zdkB %+zd, free %zdkB %+zd, unmp %zdkB %+zd)", + when, + kB(ps.heapsize_full), kB(ps.heapsize_full - heapsize_prev), + kB(used), kB(used - used_prev), + kB(ps.free_bytes_full), kB(ps.free_bytes_full - free_prev), + kB(ps.unmapped_bytes), kB(ps.unmapped_bytes - unmapped_prev)); + heapsize_prev = ps.heapsize_full; + free_prev = ps.free_bytes_full; + unmapped_prev = ps.unmapped_bytes; + DEBUG(s); +#ifdef ENABLE_SYSLOG + syslog(SYSLOG_PRI, "%s", s); +#endif + // time + s = ch_asprintf("gc: " + "%s: %ld collections (%+ld) in %zdms (%+zd)", + when, + ps.gc_no, ps.gc_no - gc_no_prev, + time_collecting, time_collecting - time_collecting_prev); + gc_no_prev = ps.gc_no; + time_collecting_prev = time_collecting; + DEBUG(s); +#ifdef ENABLE_SYSLOG 
+ syslog(SYSLOG_PRI, "%s", s); +#endif +#endif +} + +/* Change the size of allocated buffer p to size bytes. Like realloc(3), if p + is NULL, then this function is equivalent to ch_malloc(). Unlike free(3), + size may not be zero. + + If size is greater than the existing buffer length, the initial content of + new bytes is undefined. If size is less than the existing buffer length, + this function may be a no-op; i.e., it may be impossible to shrink a + buffer’s actual allocation. + + pointerful is as in ch_malloc(). If p is non-NULL, it must match the the + original allocation, though this is not validated. */ +void *ch_realloc(void *p, size_t size, bool pointerful) +{ + void *p_new; + + T_ (size > 0); + + if (p == NULL) + p_new = ch_malloc(size, pointerful); // no GC_REALLOC_ATOMIC() + else { +#ifdef HAVE_GC + p_new = GC_REALLOC(p, size); +#else + p_new = realloc(p, size); +#endif + } + + T_ (p_new); + return p_new; +} + +/* Return a copy of s in a newly allocated, pointerless buffer. Cannot fail. + + Note: Unlike strdup(3), ch_strdup() is only needed if you need to actually + modify the copy. It should not be used to simplify memory management. */ +char *ch_strdup(const char *s) +{ + char *dst; + +#ifdef HAVE_GC + dst = GC_STRDUP(s); +#else + dst = strdup(s); +#endif + + T_ (dst); + return dst; +} + +/* Like ch_asprintf(), but takes and consumes a va_list pointer. */ +char *ch_vasprintf(const char *fmt, va_list ap) +{ + va_list ap2; + int str_len; + char *str; // = ch_malloc(1024, false); + + va_copy(ap2, ap); + + T_ (0 <= (str_len = vsnprintf(NULL, 0, fmt, ap))); + str = ch_malloc(str_len + 1, false); + T_ (str_len == vsnprintf(str, str_len + 1, fmt, ap2)); + + va_end(ap2); + + return str; +} + +/* If linked with libgc, do a maximum-effort garbage collection; otherwise, do + nothing. Use when to tag memory logging. 
*/ +void garbageinate(const char *when) +{ +#ifdef HAVE_GC + GC_gcollect_and_unmap(); + ch_memory_log(when); +#endif +} + +/* Convert a signed number of bytes to kilobytes (truncated) and return it. */ +ssize_t kB(ssize_t byte_ct) +{ + return byte_ct / 1024; +} diff --git a/bin/mem.h b/bin/mem.h new file mode 100644 index 000000000..63279691d --- /dev/null +++ b/bin/mem.h @@ -0,0 +1,25 @@ +/* Memory management routines. */ + +#define _GNU_SOURCE +#pragma once + +#include +#include +#include + +/** Function prototypes **/ + +char *ch_asprintf(const char *fmt, ...); +pid_t ch_fork(void); +void ch_free_noop(void *p); +char *ch_getdelim(FILE *fp, char delim); +void ch_memory_exit(void); +void ch_memory_init(void); +void ch_memory_log(const char *when); +void *ch_malloc(size_t size, bool pointerful); +void *ch_malloc_pointerful(size_t size); +void *ch_malloc_zeroed(size_t size, bool pointerful); +void *ch_realloc(void *p, size_t size, bool pointerful); +char *ch_strdup(const char *src); +char *ch_vasprintf(const char *fmt, va_list ap); +void garbageinate(const char *when); diff --git a/bin/ch_misc.c b/bin/misc.c similarity index 57% rename from bin/ch_misc.c rename to bin/misc.c index bdee7fa20..ee9d660c5 100644 --- a/bin/ch_misc.c +++ b/bin/misc.c @@ -1,6 +1,8 @@ /* Copyright © Triad National Security, LLC, and others. */ #define _GNU_SOURCE +#include "config.h" + #include #include #include @@ -16,8 +18,8 @@ #include #include -#include "config.h" -#include "ch_misc.h" +#include "mem.h" +#include "misc.h" /** Macros **/ @@ -32,10 +34,44 @@ #define SUPP_GIDS_MAX 128 +/** Constants **/ + +/* Text colors. Note leading escape characters (U+001B), which don’t always + show up depending on your viewer. + + In principle, we should be using a library for this, e.g. + terminfo(5). However, moderately thorough web searching suggests that + pretty much any modern terminal will support 256-color ANSI codes, and this + is way simpler [1]. 
Probably should coordinate these colors with the Python + code somehow. + + [1]: https://stackoverflow.com/a/3219471 */ +static const char COLOUR_CYAN_DARK[] = ""; +static const char COLOUR_CYAN_LIGHT[] = ""; +//static const char COLOUR_GRAY[] = ""; +static const char COLOUR_RED[] = ""; +static const char COLOUR_RED_BOLD[] = ""; +static const char COLOUR_RESET[] = ""; +static const char COLOUR_YELLOW[] = ""; +static const char *_LL_COLOURS[] = { COLOUR_RED_BOLD, // fatal + COLOUR_RED_BOLD, // stderr + COLOUR_RED, // warning + COLOUR_YELLOW, // info + COLOUR_CYAN_LIGHT, // verbose + COLOUR_CYAN_DARK, // debug + COLOUR_CYAN_DARK }; // trace +/* This lets us index by verbosity, which can be negative. */ +static const char **LL_COLOURS = _LL_COLOURS + 3; + + /** External variables **/ -/* Level of chatter on stderr. */ -enum log_level verbose; +/* If true, exit abnormally on fatal error. Set in ch-run.c during argument + parsing, so will always be default value before that. */ +bool abort_fatal = false; + +/* If true, use colored logging. Set in ch-run.c. */ +bool log_color_p = false; /* Path to host temporary directory. Set during command line processing. */ char *host_tmp = NULL; @@ -43,6 +79,9 @@ char *host_tmp = NULL; /* Username of invoking users. Set during command line processing. */ char *username = NULL; +/* Level of chatter on stderr. */ +enum log_level verbose; + /* List of warnings to be re-printed on exit. This is a buffer of shared memory allocated by mmap(2), structured as a sequence of null-terminated character strings. 
Warnings that do not fit in this buffer will be lost, though we @@ -56,6 +95,7 @@ char *warnings; size_t warnings_offset = 0; + /** Function prototypes (private) **/ void mkdir_overmount(const char *path, const char *scratch); @@ -74,11 +114,12 @@ char *argv_to_string(char **argv) char *s = NULL; for (size_t i = 0; argv[i] != NULL; i++) { - char *argv_, *x; + char *argv_; bool quote_p = false; // Max length is escape every char plus two quotes and terminating zero. - T_ (argv_ = calloc(2 * strlen(argv[i]) + 3, 1)); + // Initialize to zeroes so we don’t have to terminate string later. + argv_ = ch_malloc_zeroed(2 * strlen(argv[i]) + 3, false); // Copy to new string, escaping as we go. Note lots of fall-through. I'm // not sure where this list of shell meta-characters came from; I just @@ -121,27 +162,19 @@ char *argv_to_string(char **argv) } } - if (quote_p) { - x = argv_; - T_ (1 <= asprintf(&argv_, "\"%s\"", argv_)); - free(x); - } - - if (i != 0) { - x = s; - s = cat(s, " "); - free(x); - } - - x = s; - s = cat(s, argv_); - free(x); - free(argv_); + s = cats(5, s, i == 0 ? "" : " ", + quote_p ? "\"" : "", argv_, quote_p ? "\"" : ""); } return s; } +/* Return bool b as a string. */ +const char *bool_to_string(bool b) +{ + return (b ? "yes" : "no"); +} + /* Iterate through buffer “buf” of size “s” consisting of null-terminated strings and return the number of strings in it. Key assumptions: @@ -154,9 +187,9 @@ char *argv_to_string(char **argv) 3. The buffer contains no empty strings. These assumptions are consistent with the construction of the “warnings” - shared memory buffer, which is the main justification for this function. Note - that under these assumptions, the final byte in the buffer is guaranteed to - be null. */ + shared memory buffer, which is the main justification for this function. + Note that under these assumptions, the final byte in the buffer is + guaranteed to be null. 
*/ int buf_strings_count(char *buf, size_t size) { int count = 0; @@ -182,49 +215,102 @@ bool buf_zero_p(void *buf, size_t size) return true; } -/* Concatenate strings a and b, then return the result. */ +/* Concatenate strings a and b into a newly-allocated buffer and return a + pointer to this buffer. */ char *cat(const char *a, const char *b) { - char *ret; - if (a == NULL) - a = ""; - if (b == NULL) - b = ""; - T_ (asprintf(&ret, "%s%s", a, b) == strlen(a) + strlen(b)); - return ret; + return cats(2, a, b); } -/* Like scandir(3), but (1) filter excludes “.” and “..”, (2) results are not - sorted, and (3) cannot fail (exits with an error instead). */ -int dir_ls(const char *path, struct dirent ***namelist) +/* Concatenate argc strings into a newly allocated buffer and return a pointer + to this buffer. If argc is zero, return the empty string. NULL pointers are + treated as empty strings. */ +char *cats(size_t argc, ...) { - int entry_ct; + char *ret, *next; + size_t ret_len; + char **argv; + size_t *argv_lens; + va_list ap; - entry_ct = scandir(path, namelist, dir_ls_filter, NULL); - Tf (entry_ct >= 0, "can't scan dir", path); - return entry_ct; + argv = ch_malloc(argc * sizeof(char *), true); + argv_lens = ch_malloc(argc * sizeof(size_t), false); + + // compute buffer size and convert NULLs to empty string + va_start(ap, argc); + ret_len = 1; // for terminator + for (int i = 0; i < argc; i++) + { + char *arg = va_arg(ap, char *); + if (arg == NULL) { + argv[i] = ""; + argv_lens[i] = 0; + } else { + argv[i] = arg; + argv_lens[i] = strlen(arg); + } + ret_len += argv_lens[i]; + } + va_end(ap); + + // copy strings + ret = ch_malloc(ret_len, false); + next = ret; + for (int i = 0; i < argc; i++) { + memcpy(next, argv[i], argv_lens[i]); + next += argv_lens[i]; + } + ret[ret_len-1] = '\0'; + + return ret; } -/* Return the number of entries in directory path, not including “.” and “..”; - i.e., the empty directory returns 0 despite them. 
*/ -int dir_ls_count(const char *path) -{ - int ct; - struct dirent **namelist; +/* Return a newly-allocated, null-terminated list of filenames in directory + path that match fnmatch(3)-pattern glob, excluding “.” and “..”. For a list + of everything, pass "*" for glob. Leading dots *do* match “*”. - ct = dir_ls(path, &namelist); - for (size_t i = 0; i < ct; i++) - free(namelist[i]); - free(namelist); + We use readdir(3) rather than scandir(3) because the latter allocates + memory with malloc(3). */ +char **dir_glob(const char *path, const char *glob) +{ + DIR *dp; + int i; // index of next free array element + size_t alloc_ct = 16; + char **entries = ch_malloc(alloc_ct * sizeof(char *), true); + + Tf (dp = opendir(path), "can't open directory: %s", path); + i = 0; + while (true) { + struct dirent *entry; + int matchp; + errno = 0; + entry = readdir(dp); + if (entry == NULL) { + Zf (errno, "can’t read directory: %s", path); + break; // EOF + } + matchp = fnmatch(glob, entry->d_name, FNM_EXTMATCH); + if (matchp != 0) { + T_ (matchp == FNM_NOMATCH); // error? + continue; // no match, skip + } + if (i >= alloc_ct - 1) { + alloc_ct *= 2; + entries = ch_realloc(entries, alloc_ct * sizeof(char *), true); + } + entries[i] = entry->d_name; + i++; + } + entries[i] = NULL; + Zf (closedir(dp), "can't close directory: %s", path); - return ct; + return entries; } -/* scandir(3) filter that excludes “.” and “..”: Return 0 if e->d_name is one - of those strings, else 1. */ -int dir_ls_filter(const struct dirent *e) +/* Return the number of matches for glob in path. 
*/ +int dir_glob_count(const char *path, const char *glob) { - return !(!strcmp(e->d_name, ".") || !strcmp(e->d_name, "..")); + return list_count(dir_glob(path, glob), sizeof(char *)); } /* Read the file listing environment variables at path, with records separated @@ -249,18 +335,14 @@ struct env_var *env_file_read(const char *path, int delim) vars = list_new(sizeof(struct env_var), 0); for (size_t line_no = 1; true; line_no++) { struct env_var var; - char *line = NULL; - size_t line_len = 0; // don't care but required by getline(3) + char *line; errno = 0; - if (-1 == getdelim(&line, &line_len, delim, fp)) { - if (errno == 0) // EOF - break; - else - Tf (0, "can't read: %s", path); - } - if (line[strlen(line) - 1] == '\n') // rm newline if present + line = ch_getdelim(fp, delim); + if (line == NULL) // EOF + break; + if (line[strlen(line) - 1] == (char)delim) // rm delimiter if present line[strlen(line) - 1] = 0; - if (line[0] == 0) // skip blank lines + if (line[0] == '\0') // skip blank lines continue; var = env_var_parse(line, path, line_no); list_append((void **)&vars, &var, sizeof(var)); @@ -270,37 +352,54 @@ struct env_var *env_file_read(const char *path, int delim) return vars; } +/* Return the value of environment variable name if set; otherwise, return + value_default instead. */ +char *env_get(const char *name, char *value_default) +{ + char *ret = getenv(name); + return ret ? ret : value_default; +} + + /* Set environment variable name to value. If expand, then further expand variables in value marked with "$" as described in the man page. */ void env_set(const char *name, const char *value, const bool expand) { - char *value_, *value_expanded; - bool first_written; + char *vwk = NULL; // modifiable copy of value // Walk through value fragments separated by colon and expand variables. 
- T_ (value_ = strdup(value)); - value_expanded = ""; - first_written = false; - while (true) { // loop executes ≥ once - char *fgmt = strsep(&value_, ":"); // NULL -> no more items - if (fgmt == NULL) - break; - if (expand && fgmt[0] == '$' && fgmt[1] != 0) { - fgmt = getenv(fgmt + 1); // NULL if unset - if (fgmt != NULL && fgmt[0] == 0) - fgmt = NULL; // convert empty to unset - } - if (fgmt != NULL) { // NULL -> omit from output - if (first_written) - value_expanded = cat(value_expanded, ":"); - value_expanded = cat(value_expanded, fgmt); - first_written = true; + if (expand) { + char *vwk_cur; // current location in vwk + char *vout = NULL; // output (expanded) string + bool first_out = false; // true after 1st output element written + vwk = ch_strdup(value); + vwk_cur = vwk; + while (true) { // loop executes ≥ once + char *elem = strsep(&vwk_cur, ":"); // NULL -> no more elements + if (elem == NULL) + break; + if (elem[0] == '$' && elem[1] != 0) { // looks like $VARIABLE + elem = getenv(elem + 1); // NULL if unset + if (elem != NULL && elem[0] == 0) // set but empty + elem = NULL; // convert to unset + } + if (elem != NULL) { // empty -> omit from output list + vout = cats(3, vout, first_out ? "" : ":", elem); + first_out = true; + } } + value = vwk; } // Save results. - VERBOSE("environment: %s=%s", name, value_expanded); - Z_ (setenv(name, value_expanded, 1)); + DEBUG("environment: %s=%s", name, value); + Z_ (setenv(name, value, 1)); +} + +void envs_set(const struct env_var *vars, const bool expand) +{ + for (size_t i = 0; vars[i].name != NULL; i++) + env_set(vars[i].name, vars[i].value, expand); } /* Remove variables matching glob from the environment. This is tricky, @@ -310,12 +409,12 @@ void env_set(const char *name, const char *value, const bool expand) O(n^2) search until no matches remain. Our approach is O(n): we build up a copy of environ, skipping variables - that match the glob, and then assign environ to the copy. 
(This is a valid - thing to do [2].) + that match the glob, and then assign environ to the copy. This is a valid + thing to do [2]. [1]: https://unix.stackexchange.com/a/302987 [2]: http://man7.org/linux/man-pages/man3/exec.3p.html */ -void env_unset(const char *glob) +void envs_unset(const char *glob) { char **new_environ = list_new(sizeof(char *), 0); for (size_t i = 0; environ[i] != NULL; i++) { @@ -323,9 +422,9 @@ void env_unset(const char *glob) int matchp; split(&name, &value, environ[i], '='); T_ (name != NULL); // environ entries must always have equals - matchp = fnmatch(glob, name, FNM_EXTMATCH); // extglobs if available + matchp = fnmatch(glob, name, FNM_EXTMATCH); // extglobs if available if (matchp == 0) { - VERBOSE("environment: unset %s", name); + DEBUG("environment: unset %s", name); } else { T_ (matchp == FNM_NOMATCH); *(value - 1) = '='; // rejoin line @@ -343,17 +442,15 @@ struct env_var env_var_parse(const char *line, const char *path, size_t lineno) { char *name, *value, *where; - if (path == NULL) { - T_ (where = strdup(line)); - } else { - T_ (1 <= asprintf(&where, "%s:%zu", path, lineno)); - } + if (path == NULL) + where = ch_strdup(line); + else + where = ch_asprintf("%s:%zu", path, lineno); // Split line into variable name and value. split(&name, &value, line, '='); Te (name != NULL, "can't parse variable: no delimiter: %s", where); Te (name[0] != 0, "can't parse variable: empty name: %s", where); - free(where); // for Tim // Strip leading and trailing single quotes from value, if both present. if ( strlen(value) >= 2 @@ -372,50 +469,116 @@ struct env_var env_var_parse(const char *line, const char *path, size_t lineno) list to the new location. *list can be NULL to initialize a new list. Return the new array size. - Note: ar must be cast, e.g. "list_append((void **)&foo, ...)". + Usage note: ar must be cast, e.g. "list_append((void **)&foo, ...)". 
+ + Implementation note: We could round up the new size to the next power of + two for allocation purposes, which would reduce the number of realloc() + that actually change the size. However, many allocators do this type of + thing internally already, and that seems a better place for it. Warning: This function relies on all pointers having the same representation, which is true on most modern machines but is not guaranteed by the standard [1]. We could instead return the new value of ar rather than using an out parameter, which would avoid the double pointer and associated non-portability but make it easy for callers to create dangling - pointers, i.e., after "a = list_append(b, ...)", b will dangle. That - problem could in turn be avoided by returning a *copy* of the array rather - than a modified array, but then the caller has to deal with the original - array itself. It seemed to me the present behavior was the best trade-off. + pointers, i.e., after “a = list_append(b, ...)”, b will be invalid. This + isn’t just about memory leaks but also the fact that b points to an invalid + buffer that likely *looks* valid. [1]: http://www.c-faq.com/ptrs/genericpp.html */ void list_append(void **ar, void *new, size_t size) { - int ct; + size_t ct; T_ (new != NULL); - // count existing elements - if (*ar == NULL) - ct = 0; - else - for (ct = 0; !buf_zero_p((char *)*ar + ct*size, size); ct++) - ; + ct = list_count(*ar, size); + *ar = ch_realloc(*ar, (ct+2)*size, true); // existing + new + terminator + memcpy((char *)*ar + ct*size, new, size); // append new (no overlap) + memset((char *)*ar + (ct+1)*size, 0, size); // set new terminator +} - T_ (*ar = realloc(*ar, (ct+2)*size)); // existing + new + terminator - memcpy((char *)*ar + ct*size, new, size); // append new (no overlap) - memset((char *)*ar + (ct+1)*size, 0, size); // set new terminator +/* Copy the contents of list src onto the end of dst.
*/ +void list_cat(void **dst, void *src, size_t size) +{ + size_t ct_dst, ct_src; + T_ (src != NULL); + + ct_dst = list_count(*dst, size); + ct_src = list_count(src, size); + *dst = ch_realloc(*dst, (ct_dst+ct_src+1)*size, true); + memcpy((char *)*dst + ct_dst*size, src, ct_src*size); // append src (no overlap) + memset((char *)*dst + (ct_dst+ct_src)*size, 0, size); // set new terminator +} + +/* Return the number of elements of size size in list *ar, not including the + terminating zero element. */ +size_t list_count(void *ar, size_t size) +{ + size_t ct; + + if (ar == NULL) + return 0; + + for (ct = 0; !buf_zero_p((char *)ar + ct*size, size); ct++) + ; + return ct; } /* Return a pointer to a new, empty zero-terminated array containing elements of size size, with room for ct elements without re-allocation. The latter allows to pre-allocate an arbitrary number of slots in the list, which can - then be filled directly without testing the list's length for each one. + then be filled directly without testing the list’s length for each one. (The list is completely filled with zeros, so every position has a terminator after it.) */ void *list_new(size_t size, size_t ct) { void *list; - T_ (list = calloc(ct+1, size)); + T_ (size > 0); + T_ (list = ch_malloc_zeroed((ct+1) * size, true)); return list; } -/* If verbose, print uids and gids on stderr prefixed with where. */ +/* Split str into tokens delimited by delim (multiple adjacent delimiters are + treated as one). Copy each token into a newly-allocated string buffer, and + return these strings as a new list. + + The function accepts a single delimiter, not multiple like strtok(3). */ +void *list_new_strings(char delim, const char *str) +{ + char **list; + char *str_, *tok_state; + char delims[] = { delim, '\0' }; + size_t delim_ct = 0; + + // Count delimiters so we can allocate the right size list initially, + // avoiding one realloc() per delimiter.
Note this does not account for + // adjacent delimiters and thus may overcount tokens, possibly wasting a + // small amount of memory. + for (size_t i = 0; str[i] != '\0'; i++) + delim_ct += (str[i] == delim ? 1 : 0); + + list = list_new(sizeof(char *), delim_ct + 1); + + // Note: strtok_r(3)’s interface is rather awkward; see its man page. + str_ = ch_strdup(str); // so we can modify it + tok_state = NULL; + for (size_t i = 0; true; i++) { + char *tok; + tok = strtok_r(str_, delims, &tok_state); + if (tok == NULL) + break; + T_ (i < delim_ct + 1); // bounds check + list[i] = tok; + str_ = NULL; // only pass actual string on first call + } + + return list; +} + +/* If verbose enough, print uids and gids on stderr prefixed with where. + + FIXME: Should change to DEBUG(), but that will give the file/line within + this function, which we don’t want. */ void log_ids(const char *func, int line) { uid_t ruid, euid, suid; @@ -423,9 +586,11 @@ void log_ids(const char *func, int line) gid_t supp_gids[SUPP_GIDS_MAX]; int supp_gid_ct; - if (verbose >= 3) { + if (verbose >= LL_TRACE + 1) { // don’t bother b/c haven’t needed in ages Z_ (getresuid(&ruid, &euid, &suid)); Z_ (getresgid(&rgid, &egid, &sgid)); + if (log_color_p) + T_ (EOF != fputs(LL_COLOURS[LL_TRACE], stderr)); fprintf(stderr, "%s %d: uids=%d,%d,%d, gids=%d,%d,%d + ", func, line, ruid, euid, suid, rgid, egid, sgid); supp_gid_ct = getgroups(SUPP_GIDS_MAX, supp_gids); @@ -439,18 +604,48 @@ void log_ids(const char *func, int line) fprintf(stderr, "%d", supp_gids[i]); } fprintf(stderr, "\n"); + if (log_color_p) + T_ (EOF != fputs(COLOUR_RESET, stderr)); + Z_ (fflush(stderr)); } } -void test_logging(bool fail) { - TRACE("trace"); - DEBUG("debug"); - VERBOSE("verbose"); - INFO("info"); - WARNING("warning"); - if (fail) - FATAL("the program failed inexplicably (\"log-fail\" specified)"); - exit(0); +/* Set up logging.
Note ch-run(1) specifies a bunch of color synonyms; this + translation happens during argument parsing.*/ +void logging_init(enum log_color_when when, enum log_test test) +{ + // set up colors + switch (when) { + case LL_COLOR_AUTO: + if (isatty(fileno(stderr))) + log_color_p = true; + else { + T_ (errno == ENOTTY); + log_color_p = false; + } + break; + case LL_COLOR_YES: + log_color_p = true; + break; + case LL_COLOR_NO: + log_color_p = false; + break; + case LL_COLOR_NULL: + T_ (0); // unreachable + break; + } + + // test logging + if (test >= LL_TEST_YES) { + TRACE("trace"); + DEBUG("debug"); + VERBOSE("verbose"); + INFO("info"); + WARNING("warning"); + if (test >= LL_TEST_FATAL) + FATAL("the program failed inexplicably (\"log-fail\" specified)"); + exit(0); + } } /* Create the directory at path, despite its parent not allowing write access, @@ -466,12 +661,12 @@ void mkdir_overmount(const char *path, const char *scratch) char *parent, *path2, *over, *path_dst; char *orig_dir = ".orig"; // resisted calling this .weirdal int entry_ct; - struct dirent **entries; + char **entries; VERBOSE("making writeable via symlink ranch: %s", path); - path2 = strdup(path); + path2 = ch_strdup(path); parent = dirname(path2); - T_ (1 <= asprintf(&over, "%s/%d", scratch, dir_ls_count(scratch) + 1)); + over = ch_asprintf("%s/%d", scratch, dir_glob_count(scratch, "*") + 1); path_dst = path_join(over, orig_dir); // bind-mounts @@ -483,25 +678,16 @@ void mkdir_overmount(const char *path, const char *scratch) "can't bind-mount: %s- > %s", over, parent); // symlink ranch - entry_ct = dir_ls(path_dst, &entries); + entries = dir_glob(path_dst, "*"); + entry_ct = list_count(entries, sizeof(entries[0])); DEBUG("existing entries: %d", entry_ct); for (int i = 0; i < entry_ct; i++) { - char * src = path_join(parent, entries[i]->d_name); - char * dst = path_join(orig_dir, entries[i]->d_name); - + char * src = path_join(parent, entries[i]); + char * dst = path_join(orig_dir, entries[i]); Zf 
(symlink(dst, src), "can't symlink: %s -> %s", src, dst); - - free(src); - free(dst); - free(entries[i]); } - free(entries); Zf (mkdir(path, 0755), "can't mkdir even after overmount: %s", path); - - free(path_dst); - free(over); - free(path2); } /* Create directories in path under base. Exit with an error if anything goes @@ -530,18 +716,17 @@ void mkdirs(const char *base, const char *path, char **denylist, TRACE("mkdirs: base: %s", basec); TRACE("mkdirs: path: %s", path); - for (size_t i = 0; denylist[i] != NULL; i++) + for (int i = 0; denylist[i] != NULL; i++) TRACE("mkdirs: deny: %s", denylist[i]); - pathw = cat(path, ""); // writeable copy - saveptr = NULL; // avoid warning (#1048; see also strtok_r(3)) + pathw = ch_strdup(path); // writeable copy + saveptr = NULL; // avoid warning (#1048; see also strtok_r(3)) component = strtok_r(pathw, "/", &saveptr); nextc = basec; next = NULL; while (component != NULL) { - next = cat(nextc, "/"); - next = cat(next, component); // canonical except for last component - TRACE("mkdirs: next: %s", next) + next = path_join(nextc, component); // canonical except for last + TRACE("mkdirs: next: %s", next); component = strtok_r(NULL, "/", &saveptr); // next NULL if current last if (path_exists(next, &sb, false)) { if (S_ISLNK(sb.st_mode)) { @@ -569,7 +754,7 @@ void mkdirs(const char *base, const char *path, char **denylist, Tf (0, "can't mkdir: %s", next); } nextc = next; // canonical b/c we just created last component as dir - TRACE("mkdirs: created: %s", nextc) + TRACE("mkdirs: created: %s", nextc); } } TRACE("mkdirs: done"); @@ -587,7 +772,7 @@ void msg(enum log_level level, const char *file, int line, int errno_, } noreturn void msg_fatal(const char *file, int line, int errno_, - const char *fmt, ...) + const char *fmt, ...) 
{ va_list ap; @@ -595,52 +780,76 @@ noreturn void msg_fatal(const char *file, int line, int errno_, msgv(LL_FATAL, file, line, errno_, fmt, ap); va_end(ap); - exit(EXIT_FAILURE); + if (abort_fatal) + abort(); + else + exit(EXIT_FAILURE); } /* va_list form of msg(). */ void msgv(enum log_level level, const char *file, int line, int errno_, const char *fmt, va_list ap) { - char *message, *ap_msg; - - if (level > verbose) + // note: all components contain appropriate leading/trailing space + char *text_formatted; // caller’s message, formatted + char *level_prefix; // level prefix + char *errno_code; // errno code/number + char *errno_desc; // errno description + char *text_full; // complete text but w/o color codes + const char * colour; // ANSI codes for color + const char * colour_reset; // ANSI codes to reset color + + if (level > verbose) // not verbose enough; do nothing return; - T_ (1 <= asprintf(&message, "%s[%d]: ", - program_invocation_short_name, getpid())); + // Format caller message. + if (fmt == NULL) + text_formatted = "please report this bug"; // users should not see + else + text_formatted = ch_vasprintf(fmt, ap); - // Prefix for the more urgent levels. + // Prefix some of the levels. switch (level) { case LL_FATAL: - message = cat(message, "error: "); // "fatal" too morbid for users + level_prefix = "error: "; // "fatal" too morbid for users break; case LL_WARNING: - message = cat(message, "warning: "); + level_prefix = "warning: "; break; default: + level_prefix = ""; break; } - // Default message if not specified. Users should not see this. - if (fmt == NULL) - fmt = "please report this bug"; - - T_ (1 <= vasprintf(&ap_msg, fmt, ap)); - if (errno_) { - T_ (1 <= asprintf(&message, "%s%s: %s (%s:%d %d)", message, ap_msg, - strerror(errno_), file, line, errno_)); + // errno. 
+ if (!errno_) { + errno_code = ""; + errno_desc = ""; } else { - T_ (1 <= asprintf(&message, "%s%s (%s:%d)", message, ap_msg, file, line)); + errno_code = cat(" ", strerrorname_np(errno_)); // FIXME: non-portable + errno_desc = ch_asprintf(": %s", strerror(errno_)); } - if (level == LL_WARNING) { - warnings_offset += string_append(warnings, message, WARNINGS_SIZE, - warnings_offset); - } - fprintf(stderr, "%s\n", message); + // Color. + if (log_color_p) { + colour = LL_COLOURS[level]; + colour_reset = COLOUR_RESET; + } else { + colour = ""; + colour_reset = ""; + }; + + // Format and print. + text_full = ch_asprintf("%s[%d]: %s%s%s (%s:%d%s)", + program_invocation_short_name, getpid(), + level_prefix, text_formatted, errno_desc, + file, line, errno_code); + fprintf(stderr, "%s%s%s\n", colour, text_full, colour_reset); if (fflush(stderr)) - abort(); // can't print an error b/c already trying to do that + abort(); // can’t print an error b/c already trying to do that + if (level == LL_WARNING) + warnings_offset += string_append(warnings, text_full, + WARNINGS_SIZE, warnings_offset); } /* Return true if the given path exists, false otherwise. On error, exit. If @@ -669,27 +878,23 @@ bool path_exists(const char *path, struct stat *statbuf, bool follow_symlink) /* Concatenate paths a and b, then return the result. */ char *path_join(const char *a, const char *b) { - char *ret; - T_ (a != NULL); T_ (strlen(a) > 0); T_ (b != NULL); T_ (strlen(b) > 0); - T_ (asprintf(&ret, "%s/%s", a, b) == strlen(a) + strlen(b) + 1); - - return ret; + return ch_asprintf("%s/%s", a, b); } /* Return the mount flags of the file system containing path, suitable for passing to mount(2). - This is messy because, the flags we get from statvfs(3) are ST_* while the + This is messy because the flags we get from statvfs(3) are ST_* while the flags needed by mount(2) are MS_*. 
My glibc has a comment in bits/statvfs.h - that the ST_* "should be kept in sync with" the MS_* flags, and the values + that the ST_* “should be kept in sync with” the MS_* flags, and the values do seem to match, but there are additional undocumented flags in there. - Also, the kernel contains a test "unprivileged-remount-test.c" that - manually translates the flags. Thus, I wasn't comfortable simply passing + Also, the kernel contains a test “unprivileged-remount-test.c” that + manually translates the flags. Thus, I wasn’t comfortable simply passing the output of statvfs(3) to mount(2). */ unsigned long path_mount_flags(const char *path) { @@ -723,17 +928,14 @@ unsigned long path_mount_flags(const char *path) | (sv.f_flag & ST_SYNCHRONOUS ? MS_SYNCHRONOUS : 0); } -/* Split path into dirname and basename. */ +/* Split path into dirname and basename. If dir and/or base is NULL, then skip + that output. */ void path_split(const char *path, char **dir, char **base) { - char *path2; - - T_ (path2 = strdup(path)); - T_ (*dir = strdup(dirname(path2))); - free(path2); - T_ (path2 = strdup(path)); - T_ (*base = strdup(basename(path2))); - free(path2); + if (dir != NULL) + *dir = dirname(ch_strdup(path)); + if (base != NULL) + *base = basename(ch_strdup(path)); } /* Return true if path is a subdirectory of base, false otherwise. Acts on the @@ -778,7 +980,7 @@ char *realpath_(const char *path, bool fail_ok) if (pathc == NULL) { if (fail_ok) { - T_ (pathc = strdup(path)); + pathc = ch_strdup(path); } else { Tf (false, "can't canonicalize: %s", path); } @@ -798,32 +1000,23 @@ void replace_char(char *s, char old, char new) /* Split string str at first instance of delimiter del. Set *a to the part before del, and *b to the part after. Both can be empty; if no token is present, set both to NULL. Unlike strsep(3), str is unchanged; *a and *b - point into a new buffer allocated with malloc(3). 
This has two - implications: (1) the caller must free(3) *a but not *b, and (2) the parts - can be rejoined by setting *(*b-1) to del. The point here is to provide an - easier wrapper for strsep(3). */ + point into a new buffer. Therefore, the parts can be rejoined by setting + *(*b-1) to del. The point here is to provide an easier wrapper for + strsep(3). */ void split(char **a, char **b, const char *str, char del) { - char *tmp; char delstr[2] = { del, 0 }; T_ (str != NULL); - tmp = strdup(str); - *b = tmp; + *b = ch_strdup(str); *a = strsep(b, delstr); if (*b == NULL) *a = NULL; } -/* Report the version number. */ -void version(void) -{ - fprintf(stderr, "%s\n", VERSION); -} - -/* Append null-terminated string “str” to the memory buffer “offset” bytes after - from the address pointed to by “addr”. Buffer length is “size” bytes. Return - the number of bytes written. If there isn’t enough room for the string, do - nothing and return zero. */ +/* Append null-terminated string “str” to the memory buffer “offset” bytes + after from the address pointed to by “addr”. Buffer length is “size” bytes. + Return the number of bytes written. If there isn’t enough room for the + string, do nothing and return zero. */ size_t string_append(char *addr, char *str, size_t size, size_t offset) { size_t written = strlen(str) + 1; @@ -834,23 +1027,32 @@ size_t string_append(char *addr, char *str, size_t size, size_t offset) return written; } +/* Report the version number. */ +void version(void) +{ + fprintf(stderr, "%s\n", VERSION); +} + /* Reprint messages stored in “warnings” memory buffer. 
*/ void warnings_reprint(void) { size_t offset = 0; int warn_ct = buf_strings_count(warnings, WARNINGS_SIZE); - if (warn_ct > 0) - fprintf(stderr, "%s[%d]: warning: reprinting first %d warning(s)\n", - program_invocation_short_name, getpid(), warn_ct); - - while ( warnings[offset] != 0 - || (offset < (WARNINGS_SIZE - 1) && warnings[offset+1] != 0)) { - fputs(warnings + offset, stderr); - fputc('\n', stderr); - offset += strlen(warnings + offset) + 1; + if (warn_ct > 0) { + if (log_color_p) + T_ (EOF != fputs(LL_COLOURS[LL_WARNING], stderr)); + T_ (1 <= fprintf(stderr, "%s[%d]: reprinting first %d warning(s)\n", + program_invocation_short_name, getpid(), warn_ct)); + while ( warnings[offset] != 0 + || (offset < (WARNINGS_SIZE - 1) && warnings[offset+1] != 0)) { + T_ (EOF != fputs(warnings + offset, stderr)); + T_ (EOF != fputc('\n', stderr)); + offset += strlen(warnings + offset) + 1; + } + if (log_color_p) + T_ (EOF != fputs(COLOUR_RESET, stderr)); + if (fflush(stderr)) + abort(); // can't print an error b/c already trying to do that } - - if (fflush(stderr)) - abort(); // can't print an error b/c already trying to do that } diff --git a/bin/ch_misc.h b/bin/misc.h similarity index 81% rename from bin/ch_misc.h rename to bin/misc.h index f590a0890..2b0e16cd7 100644 --- a/bin/ch_misc.h +++ b/bin/misc.h @@ -5,9 +5,12 @@ libraries that ch_core requires. */ #define _GNU_SOURCE +#pragma once + #include #include #include +#include #include @@ -20,6 +23,11 @@ and hopefully others support the following extension. */ #define noreturn __attribute__ ((noreturn)) +/* Syslog facility and level we use. */ +#ifdef ENABLE_SYSLOG +#define SYSLOG_PRI (LOG_USER|LOG_INFO) +#endif + /* Size of “warnings” buffer, in bytes. We want this to be big enough that we don’t need to worry about running out of room. */ #define WARNINGS_SIZE (4*1024) @@ -66,35 +74,25 @@ #define Zf(x, ...) if (x) msg_fatal(__FILE__, __LINE__, errno, __VA_ARGS__) #define Ze(x, ...) 
if (x) msg_fatal(__FILE__, __LINE__, 0, __VA_ARGS__) -#define FATAL(...) msg_fatal( __FILE__, __LINE__, 0, __VA_ARGS__); -#define WARNING(...) msg(LL_WARNING, __FILE__, __LINE__, 0, __VA_ARGS__); -#define INFO(...) msg(LL_INFO, __FILE__, __LINE__, 0, __VA_ARGS__); -#define VERBOSE(...) msg(LL_VERBOSE, __FILE__, __LINE__, 0, __VA_ARGS__); -#define DEBUG(...) msg(LL_DEBUG, __FILE__, __LINE__, 0, __VA_ARGS__); -#define TRACE(...) msg(LL_TRACE, __FILE__, __LINE__, 0, __VA_ARGS__); +#define FATAL(...) msg_fatal( __FILE__, __LINE__, 0, __VA_ARGS__) +#define WARNING(...) msg(LL_WARNING, __FILE__, __LINE__, 0, __VA_ARGS__) +#define INFO(...) msg(LL_INFO, __FILE__, __LINE__, 0, __VA_ARGS__) +#define VERBOSE(...) msg(LL_VERBOSE, __FILE__, __LINE__, 0, __VA_ARGS__) +#define DEBUG(...) msg(LL_DEBUG, __FILE__, __LINE__, 0, __VA_ARGS__) +#define TRACE(...) msg(LL_TRACE, __FILE__, __LINE__, 0, __VA_ARGS__) /** Types **/ -enum env_action { ENV_END = 0, // terminate list of environment changes - ENV_SET_DEFAULT, // set by /ch/environment within image - ENV_SET_VARS, // set by list of variables - ENV_UNSET_GLOB }; // unset glob matches +#ifndef HAVE_COMPARISON_FN_T +typedef int (*comparison_fn_t) (const void *, const void *); +#endif struct env_var { char *name; char *value; }; -struct env_delta { - enum env_action action; - union { - int delim; // ENV_SET_DEFAULT - struct env_var *vars; // ENV_SET_VARS - char *glob; // ENV_UNSET_GLOB - } arg; -}; - enum log_level { LL_FATAL = -3, LL_STDERR = -2, LL_WARNING = -1, @@ -103,12 +101,23 @@ enum log_level { LL_FATAL = -3, LL_DEBUG = 2, LL_TRACE = 3 }; +enum log_color_when { LL_COLOR_NULL = 0, + LL_COLOR_AUTO, + LL_COLOR_YES, + LL_COLOR_NO }; + +enum log_test { LL_TEST_NONE = 0, + LL_TEST_YES = 1, + LL_TEST_FATAL = 2 }; + /** External variables **/ -extern enum log_level verbose; +extern bool abort_fatal; +extern bool log_color_p; extern char *host_tmp; extern char *username; +extern enum log_level verbose; extern char *warnings; extern 
size_t warnings_offset; @@ -116,19 +125,26 @@ extern size_t warnings_offset; /** Function prototypes **/ char *argv_to_string(char **argv); +const char *bool_to_string(bool b); int buf_strings_count(char *str, size_t s); bool buf_zero_p(void *buf, size_t size); char *cat(const char *a, const char *b); -int dir_ls(const char *path, struct dirent ***namelist); -int dir_ls_count(const char *path); -int dir_ls_filter(const struct dirent *e); +char *cats(size_t argc, ...); +char **dir_glob(const char *path, const char *glob); +int dir_glob_count(const char *path, const char *glob); struct env_var *env_file_read(const char *path, int delim); +char *env_get(const char *name, char *value_default); void env_set(const char *name, const char *value, const bool expand); -void env_unset(const char *glob); +void envs_set(const struct env_var *envs, const bool expand); +void envs_unset(const char *glob); struct env_var env_var_parse(const char *line, const char *path, size_t lineno); void list_append(void **ar, void *new, size_t size); +void list_cat(void **dst, void *src, size_t size); +size_t list_count(void *ar, size_t size); +void *list_new_strings(char delim, const char *s); void *list_new(size_t size, size_t ct); void log_ids(const char *func, int line); +void logging_init(enum log_color_when when, enum log_test test); void test_logging(bool fail); void mkdirs(const char *base, const char *path, char **denylist, const char *scratch); diff --git a/bin/seccomp.c b/bin/seccomp.c new file mode 100644 index 000000000..bf620a0e3 --- /dev/null +++ b/bin/seccomp.c @@ -0,0 +1,261 @@ +/* Copyright © Triad National Security, LLC, and others. + + This interface contains the seccomp filter for root emulation. 
*/ + +#define _GNU_SOURCE +#include "config.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "core.h" +#include "hook.h" +#include "mem.h" + + +/** Macros **/ + +/* On some distros (e.g., CentOS 7), some of the architecture numbers are + missing. The workaround is to use the numbers I have on Debian Bullseye. + The reason I (Reid) feel moderately comfortable doing this is how militant + Linux is about not changing the userspace API. */ +#ifndef AUDIT_ARCH_AARCH64 +#define AUDIT_ARCH_AARCH64 0xC00000B7u // undeclared on CentOS 7 +#undef AUDIT_ARCH_ARM // uses undeclared EM_ARM on CentOS 7 +#define AUDIT_ARCH_ARM 0x40000028u +#endif + +/* Special values for seccomp tables. These must be negative to avoid clashing + with real syscall numbers (note zero is often a valid syscall number). */ +#define NR_NON -1 // syscall does not exist on architecture +#define NR_END -2 // end of table + +/** Constants **/ + +/* Architectures we support for seccomp. Order matches the table below. */ +int SECCOMP_ARCHS[] = { AUDIT_ARCH_AARCH64, // arm64 + AUDIT_ARCH_ARM, // arm32 + AUDIT_ARCH_I386, // x86 (32-bit) + AUDIT_ARCH_PPC64LE, // PPC + AUDIT_ARCH_S390X, // s390x + AUDIT_ARCH_X86_64, // x86-64 + NR_END }; + +/* System call numbers that we fake with seccomp (by doing nothing and + returning success). Some processors can execute multiple architectures + (e.g., 64-bit Intel CPUs can run both x86-64 and x86 code), and a process’ + architecture can even change (if you execve(2) a binary of a different + architecture), so we can’t just use the build host’s architecture. + + I haven’t figured out how to gather these system call numbers + automatically, so they are compiled from [1, 2, 3]. See also [4] for a more + general reference. + + NOTE: The total number of faked syscalls (i.e., entries below other than NR_NON) + must be somewhat less than 256. I haven’t computed the exact limit.
There + will be an assertion failure at runtime if this is exceeded. + + WARNING: Keep this list consistent with the ch-image(1) man page! + + [1]: https://chromium.googlesource.com/chromiumos/docs/+/HEAD/constants/syscalls.md#Cross_arch-Numbers + [2]: https://github.com/strace/strace/blob/v4.26/linux/powerpc64/syscallent.h + [3]: https://github.com/strace/strace/blob/v6.6/src/linux/s390x/syscallent.h + [4]: https://unix.stackexchange.com/questions/421750 */ +int FAKE_SYSCALL_NRS[][6] = { + // arm64 arm32 x86 PPC64 s390x x86-64 + // ------ ------ ------ ------ ------ ------ + { 91, 185, 185, 184, 185, 126 }, // capset + { NR_NON, 182, 182, 181, 212, 92 }, // chown + { NR_NON, 212, 212, NR_NON, NR_NON, NR_NON }, // chown32 + { 55, 95, 95, 95, 207, 93 }, // fchown + { NR_NON, 207, 207, NR_NON, NR_NON, NR_NON }, // fchown32 + { 54, 325, 298, 289, 291, 260 }, // fchownat + { NR_NON, 16, 16, 16, 198, 94 }, // lchown + { NR_NON, 198, 198, NR_NON, NR_NON, NR_NON }, // lchown32 + { 104, 347, 283, 268, 277, 246 }, // kexec_load + { 152, 139, 139, 139, 216, 123 }, // setfsgid + { NR_NON, 216, 216, NR_NON, NR_NON, NR_NON }, // setfsgid32 + { 151, 138, 138, 138, 215, 122 }, // setfsuid + { NR_NON, 215, 215, NR_NON, NR_NON, NR_NON }, // setfsuid32 + { 144, 46, 46, 46, 214, 106 }, // setgid + { NR_NON, 214, 214, NR_NON, NR_NON, NR_NON }, // setgid32 + { 159, 81, 81, 81, 206, 116 }, // setgroups + { NR_NON, 206, 206, NR_NON, NR_NON, NR_NON }, // setgroups32 + { 143, 71, 71, 71, 204, 114 }, // setregid + { NR_NON, 204, 204, NR_NON, NR_NON, NR_NON }, // setregid32 + { 149, 170, 170, 169, 210, 119 }, // setresgid + { NR_NON, 210, 210, NR_NON, NR_NON, NR_NON }, // setresgid32 + { 147, 164, 164, 164, 208, 117 }, // setresuid + { NR_NON, 208, 208, NR_NON, NR_NON, NR_NON }, // setresuid32 + { 145, 70, 70, 70, 203, 113 }, // setreuid + { NR_NON, 203, 203, NR_NON, NR_NON, NR_NON }, // setreuid32 + { 146, 23, 23, 23, 213, 105 }, // setuid + { NR_NON, 213, 213, NR_NON, NR_NON, NR_NON }, // 
setuid32 + { NR_END }, // end +}; +int FAKE_MKNOD_NRS[] = + { NR_NON, 14, 14, 14, 14, 133 }; +int FAKE_MKNODAT_NRS[] = + { 33, 324, 297, 288, 290, 259 }; + + +/** Function prototypes (private) **/ + +void iw(struct sock_fprog *p, int i, + uint16_t op, uint32_t k, uint8_t jt, uint8_t jf); + + +/** Functions **/ + +/* Prestart hook to set up the fake-syscall seccomp(2) filter. This computes + and installs a long-ish but fairly simple BPF program to implement the + filter. To understand this rather hairy language: + + 1. https://man7.org/training/download/secisol_seccomp_slides.pdf + 2. https://www.kernel.org/doc/html/latest/userspace-api/seccomp_filter.html + 3. https://elixir.bootlin.com/linux/latest/source/samples/seccomp */ +void hook_seccomp_install(struct container *c, void *d) +{ + int arch_ct = sizeof(SECCOMP_ARCHS)/sizeof(SECCOMP_ARCHS[0]) - 1; + int syscall_cts[arch_ct]; + struct sock_fprog p = { 0 }; + int ii, idx_allow, idx_fake, idx_mknod, idx_mknodat, idx_next_arch; + // Lengths of certain instruction groups. These are all obtained manually + // by counting below, violating DRY. We could automate these counts, but it + // seemed like the cost of extra buffers and code to do that would exceed + // that of maintaining the manual counts. + int ct_jump_start = 4; // ld arch & syscall nr, arch test, end-of-arch jump + int ct_mknod_jump = 2; // jump table handling for mknod(2) and mknodat(2) + int ct_mknod = 2; // mknod(2) handling + int ct_mknodat = 6; // mknodat(2) handling + + // Count how many syscalls we are going to fake in the standard way. We + // need this to compute the right offsets for all the jumps. 
+ for (int ai = 0; SECCOMP_ARCHS[ai] != NR_END; ai++) { + p.len += ct_jump_start + ct_mknod_jump; + syscall_cts[ai] = 0; + for (int si = 0; FAKE_SYSCALL_NRS[si][0] != NR_END; si++) { + bool syscall_p = FAKE_SYSCALL_NRS[si][ai] != NR_NON; + syscall_cts[ai] += syscall_p; + p.len += syscall_p; // syscall jump table entry + } + } + + // Initialize program buffer. + p.len += ( 1 // return allow + + 1 // return fake success + + ct_mknod // mknod(2) handling + + ct_mknodat); // mknodat(2) handling + DEBUG("seccomp: filter program has %d instructions", p.len); + p.filter = ch_malloc(p.len * sizeof(struct sock_filter), false); + + // Return call addresses. Allow needs to come first because we’ll jump to + // it for unknown architectures. + idx_allow = p.len - 2 - ct_mknod - ct_mknodat; + idx_fake = p.len - 1 - ct_mknod - ct_mknodat; + idx_mknod = p.len - ct_mknod - ct_mknodat; + idx_mknodat = p.len - ct_mknodat; + + // Build a jump table for each architecture. The gist is: if architecture + // matches, fall through into the jump table, otherwise jump to the next + // architecture (or ALLOW for the last architecture). 
+ ii = 0; + idx_next_arch = -1; // avoid warning on some compilers + for (int ai = 0; SECCOMP_ARCHS[ai] != NR_END; ai++) { + int jump; + idx_next_arch = ii + syscall_cts[ai] + ct_jump_start + ct_mknod_jump; + // load arch into accumulator + iw(&p, ii++, BPF_LD|BPF_W|BPF_ABS, + offsetof(struct seccomp_data, arch), 0, 0); + // jump to next arch if arch doesn't match + jump = idx_next_arch - ii - 1; + T_ (jump <= 255); + iw(&p, ii++, BPF_JMP|BPF_JEQ|BPF_K, SECCOMP_ARCHS[ai], 0, jump); + // load syscall number into accumulator + iw(&p, ii++, BPF_LD|BPF_W|BPF_ABS, + offsetof(struct seccomp_data, nr), 0, 0); + // jump table of syscalls + for (int si = 0; FAKE_SYSCALL_NRS[si][0] != NR_END; si++) { + int nr = FAKE_SYSCALL_NRS[si][ai]; + if (nr != NR_NON) { + jump = idx_fake - ii - 1; + T_ (jump <= 255); + iw(&p, ii++, BPF_JMP|BPF_JEQ|BPF_K, nr, jump, 0); + } + } + // jump to mknod(2) handling (add even if syscall not implemented to + // make the instruction counts simpler) + jump = idx_mknod - ii - 1; + T_ (jump <= 255); + iw(&p, ii++, BPF_JMP|BPF_JEQ|BPF_K, FAKE_MKNOD_NRS[ai], jump, 0); + // jump to mknodat(2) handling + jump = idx_mknodat - ii - 1; + T_ (jump <= 255); + iw(&p, ii++, BPF_JMP|BPF_JEQ|BPF_K, FAKE_MKNODAT_NRS[ai], jump, 0); + // unfiltered syscall, jump to allow (limit of 255 doesn’t apply to JA) + jump = idx_allow - ii - 1; + iw(&p, ii++, BPF_JMP|BPF_JA, jump, 0, 0); + } + T_ (idx_next_arch == idx_allow); + + // Returns. (Note that if we wanted a non-zero errno, we’d bitwise-or with + // SECCOMP_RET_ERRNO. But because fake success is errno == 0, we don’t need + // a no-op “| 0”.) + T_ (ii == idx_allow); + iw(&p, ii++, BPF_RET|BPF_K, SECCOMP_RET_ALLOW, 0, 0); + T_ (ii == idx_fake); + iw(&p, ii++, BPF_RET|BPF_K, SECCOMP_RET_ERRNO, 0, 0); + + // mknod(2) handling. This just loads the file mode and jumps to the right + // place in the mknodat(2) handling. 
+ T_ (ii == idx_mknod); + // load mode argument into accumulator + iw(&p, ii++, BPF_LD|BPF_W|BPF_ABS, + offsetof(struct seccomp_data, args[1]), 0, 0); + // jump to mode test + iw(&p, ii++, BPF_JMP|BPF_JA, 1, 0, 0); + + // mknodat(2) handling. + T_ (ii == idx_mknodat); + // load mode argument into accumulator + iw(&p, ii++, BPF_LD|BPF_W|BPF_ABS, + offsetof(struct seccomp_data, args[2]), 0, 0); + // jump to fake return if trying to create a device. + iw(&p, ii++, BPF_ALU|BPF_AND|BPF_K, S_IFMT, 0, 0); // file type only + iw(&p, ii++, BPF_JMP|BPF_JEQ|BPF_K, S_IFCHR, 2, 0); + iw(&p, ii++, BPF_JMP|BPF_JEQ|BPF_K, S_IFBLK, 1, 0); + // returns + iw(&p, ii++, BPF_RET|BPF_K, SECCOMP_RET_ALLOW, 0, 0); + iw(&p, ii++, BPF_RET|BPF_K, SECCOMP_RET_ERRNO, 0, 0); + + // Install filter. Use prctl(2) rather than seccomp(2) for slightly greater + // compatibility (Linux 3.5 rather than 3.17) and because there is a glibc + // wrapper. + T_ (ii == p.len); // next instruction now one past the end of the buffer + Z_ (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &p)); + DEBUG("seccomp: see contributor's guide to disassemble"); + + // Test filter. This will fail if the kernel executes the call (because we + // are not really privileged and the arguments are bogus) or succeed if + // filter handles it. We selected it over something more naturally in the + // filter, e.g. setuid(2), because (1) no container process should ever use + // it and (2) it’s unlikely to be emulated by a smarter filter in the + // future, i.e., it won’t silently start doing something. + Zf (syscall(SYS_kexec_load, 0, 0, NULL, 0), + "seccomp root emulation failed (is your architecture supported?)"); +} + +/* Helper function to write seccomp-bpf programs. 
*/ +void iw(struct sock_fprog *p, int i, + uint16_t op, uint32_t k, uint8_t jt, uint8_t jf) +{ + p->filter[i] = (struct sock_filter){ op, jt, jf, k }; +} + diff --git a/bin/seccomp.h b/bin/seccomp.h new file mode 100644 index 000000000..821a646ee --- /dev/null +++ b/bin/seccomp.h @@ -0,0 +1,10 @@ +/* Copyright © Triad National Security, LLC, and others. + + This interface contains the seccomp filter for root emulation. */ + +#define _GNU_SOURCE +#pragma once + +#include "core.h" + +void hook_seccomp_install(struct container *c, void *d); diff --git a/configure.ac b/configure.ac index f05b4cee0..121efd593 100644 --- a/configure.ac +++ b/configure.ac @@ -74,6 +74,74 @@ AC_CONFIG_FILES([Makefile test/Makefile]) +### Our macros ###################################################### + +# Macro to validate executable versions. Arguments: +# +# $1 name of variable containing executable name or absolute path +# $2 minimum version +# $3 append to $1 to make shell pipeline to get actual version only +# (e.g., without program name) +# +# This macro is not able to determine if a program exists, only whether its +# version is sufficient. ${!1} (i.e, the value of the variable whose name is +# stored in $1) must be either empty, an absolute path to an executable, or +# the name of a program in $PATH. A prior macro such as AX_WITH_PROG can be +# used to ensure this condition. +# +# If ${!1} is an absolute path, and that file isn’t executable, error out. If +# it’s something other than an absolute path, assume it’s the name of a +# program in $PATH; if not, the behavior is undefined but not good (FIXME). +# +# Post-conditions: +# +# 1. If ${!1} is non-empty and the version reported by the program is +# greater than or equal to the minimum, ${!1} is unchanged. If ${!1} is +# empty or reported version is insufficient, ${!1} is the empty string. +# This lets you test version sufficiency by whether ${!1} is empty. +# +# 2. $1_VERSION_NOTE contains a brief explanatory note. 
+# +AC_DEFUN([CH_CHECK_VERSION], [ + AS_VAR_PUSHDEF([prog], [$1]) + AS_IF([test -n "$prog"], [ + # ${!1} is non-empty + AS_CASE([$prog], + # absolute path; check if executable + [/*], [AC_MSG_CHECKING([if $prog is executable]) + AS_IF([test -x "$prog"], + [AC_MSG_RESULT([ok])], + [AC_MSG_RESULT([no]) + AC_MSG_ERROR([must be executable])])]) + AC_MSG_CHECKING([if $prog version >= $2]) + vact=$($prog $3) + AX_COMPARE_VERSION([$2], [le], [$vact], [ + AC_SUBST([$1_VERSION_NOTE], ["ok ($vact)"]) + AC_MSG_RESULT([ok ($vact)]) + ], [ + AC_SUBST([$1_VERSION_NOTE], ["too old ($vact)"]) + AC_MSG_RESULT([too old ($vact)]) + AS_UNSET([$1]) + ]) + ], [ + # ${!1} is empty + AC_SUBST([$1_VERSION_NOTE], ["not found"]) + AS_UNSET([$1]) + ]) + AS_VAR_POPDEF([prog]) +]) + +# Macro to validate that $1 is a directory (or a symlink to one). If not, exit +# with error, prefixed with $2. +AC_DEFUN([CH_REQUIRE_DIR], [ + AC_MSG_CHECKING([whether $1 is a directory]) + AS_IF([test -d "$1"], + [AC_MSG_RESULT(yes)], + [AC_MSG_RESULT(no) + AC_MSG_ERROR([$2: not a directory: $1])]) +]) + + ### Options ################################################################## # Note: Variables must match option, e.g. --disable-foo-bar => enable_foo_bar. 
@@ -120,6 +188,7 @@ AC_ARG_ENABLE([test], AS_HELP_STRING([--disable-test], [test suite]), [], [enable_test=yes]) +# --with-seccomp AC_ARG_WITH([seccomp], AS_HELP_STRING([--with-seccomp=(yes|no)], [support for --seccomp])) @@ -139,6 +208,69 @@ AS_CASE([$with_seccomp], [*], # anything else [AC_MSG_ERROR([invalid --with-seccomp arg: $with_seccomp])]) +# --with-gc +AC_ARG_WITH([gc], + AS_HELP_STRING([--with-gc=@<:@yes|no@:>@], + [enable conservative garbage collection with libgc])) +AS_CASE([$with_gc], + [yes], + [want_gc=yes + need_gc=yes], + [no], + [want_gc=no + need_gc=no], + [''], + [want_gc=yes + need_gc=no], + [*], + [AC_MSG_ERROR([--with-gc: bad argument: $with_gc])]) + +AC_ARG_WITH([gc-include], + AS_HELP_STRING([--with-gc-include=DIR], + [directory containing gc.h (if not in defaults)])) +AS_IF([test -n "$with_gc_include"], + [inc_gc=$with_gc_include + CH_REQUIRE_DIR([$inc_gc], [--with-gc-include])]) + +AC_ARG_WITH([gc-lib], + AS_HELP_STRING([--with-gc-lib=DIR], + [directory containing libgc.so (if not in defaults)])) +AS_IF([test -n "$with_gc_lib"], + [lib_gc=$with_gc_lib + CH_REQUIRE_DIR([$lib_gc], [--with-gc-lib])]) + +# --with-json +AC_ARG_WITH([json], + AS_HELP_STRING([--with-json=@<:@yes|no@:>@], + [enable JSON features by linking with libcjson])) +AS_CASE([$with_json], + [yes], # --with-json=yes or --with-json + [want_json=yes + need_json=yes], + [no], # --with-json=no or --without-json + [want_json=no + need_json=no], + [''], # neither --with-json nor --without-json specified + [want_json=yes + need_json=no], + [*], # unknown argument + [AC_MSG_ERROR([--with-json: bad argument: $with_json])]) + +AC_ARG_WITH([json-include], + AS_HELP_STRING([--with-json-include=DIR], + [directory containing cJSON.h (if not in defaults)])) +AS_IF([test -n "$with_json_include"], + [inc_json=$with_json_include + CH_REQUIRE_DIR([$inc_json], [--with-json-include])]) + +AC_ARG_WITH([json-lib], + AS_HELP_STRING([--with-json-lib=DIR], + [directory containing libcjson.so (if 
not in defaults)])) +AS_IF([test -n "$with_json_lib"], + [lib_json=$with_json_lib + CH_REQUIRE_DIR([$lib_json], [--with-json-lib])]) + +# --with-libsquashfuse AC_ARG_WITH([libsquashfuse], AS_HELP_STRING([--with-libsquashfuse=@<:@yes|no|PATH@:>@], [whether to link with libsquashfuse])) @@ -174,145 +306,49 @@ AC_ARG_WITH([sphinx-python], [sphinx_python='']) -### Feature test macros ###################################################### - -# Macro to validate executable versions. Arguments: -# -# $1 name of variable containing executable name or absolute path -# $2 minimum version -# $3 append to $1 to make shell pipeline to get actual version only -# (e.g., without program name) -# -# This macro is not able to determine if a program exists, only whether its -# version is sufficient. ${!1} (i.e, the value of the variable whose name is -# stored in $1) must be either empty, an absolute path to an executable, or -# the name of a program in $PATH. A prior macro such as AX_WITH_PROG can be -# used to ensure this condition. -# -# If ${!1} is an absolute path, and that file isn’t executable, error out. If -# it’s something other than an absolute path, assume it’s the name of a -# program in $PATH; if not, the behavior is undefined but not good (FIXME). -# -# Post-conditions: -# -# 1. If ${!1} is non-empty and the version reported by the program is -# greater than or equal to the minimum, ${!1} is unchanged. If ${!1} is -# empty or reported version is insufficient, ${!1} is the empty string. -# This lets you test version sufficiency by whether ${!1} is empty. -# -# 2. $1_VERSION_NOTE contains a brief explanatory note. 
-# -AC_DEFUN([CH_CHECK_VERSION], [ - AS_VAR_PUSHDEF([prog], [$1]) - AS_IF([test -n "$prog"], [ - # ${!1} is non-empty - AS_CASE([$prog], - # absolute path; check if executable - [/*], [AC_MSG_CHECKING([if $prog is executable]) - AS_IF([test -e "$prog"], - [AC_MSG_RESULT([ok])], - [AC_MSG_RESULT([no]) - AC_MSG_ERROR([must be executable])])]) - AC_MSG_CHECKING([if $prog version >= $2]) - vact=$($prog $3) - AX_COMPARE_VERSION([$2], [le], [$vact], [ - AC_SUBST([$1_VERSION_NOTE], ["ok ($vact)"]) - AC_MSG_RESULT([ok ($vact)]) - ], [ - AC_SUBST([$1_VERSION_NOTE], ["too old ($vact)"]) - AC_MSG_RESULT([too old ($vact)]) - AS_UNSET([$1]) - ]) - ], [ - # ${!} is empty - AC_SUBST([$1_VERSION_NOTE], ["not found"]) - AS_UNSET([$1]) - ]) - AS_VAR_POPDEF([prog]) -]) - - ### C compiler ############################################################### # Need a C99 compiler. (See https://stackoverflow.com/a/28558338.) AC_PROG_CC -# Set up CFLAGS. -ch_cflags='-std=c99 -Wall' -AS_IF([test -n "$lib_libsquashfuse"], - [ch_cflags="$ch_cflags -I$inc_libsquashfuse -L$lib_libsquashfuse" - # Without this, clang fails with “error: argument unused during - # compilation” on the -L. GCC ignores it. - ch_cflags="$ch_cflags -Wno-unused-command-line-argument"]) +# Set up CFLAGS. -Wno-unused-command-line-argument is for clang, which fails +# with an error if -L is present for non-linking stages. It seemed easier to +# add it unconditionally rather than maintain conditionals about which +# compiler and which libraries. 
+ch_cflags='-std=c99 -Wall -Wno-unused-command-line-argument' AS_IF([test $use_werror = yes], [ch_cflags="$ch_cflags -Werror"]) +AS_IF([test -n "$inc_gc"], # -L$lib_gc added below + [ch_cflags="$ch_cflags -I$inc_gc"]) +AS_IF([test -n "$inc_json"], # -L$lib_json added below + [ch_cflags="$ch_cflags -I$inc_json"]) +AS_IF([test -n "$lib_libsquashfuse"], + [ch_cflags="$ch_cflags -I$inc_libsquashfuse -L$lib_libsquashfuse"]) -AX_CHECK_COMPILE_FLAG([$ch_cflags], [ - CFLAGS="$CFLAGS $ch_cflags" -], [ - AC_MSG_ERROR([no suitable C99 compiler found]) -]) +AX_CHECK_COMPILE_FLAG([$ch_cflags], [], + [AC_MSG_ERROR([no suitable C99 compiler found])]) AS_IF([test "$CC" = icc], [AC_MSG_ERROR([icc not supported (see PR @%:@481)])]) ### ch-run required ########################################################## -# Only ch-run needs any kind of interesting library stuff; this variable holds -# the library arguments we need. This also requires us to use AC_CHECK_LIB -# instead of the (recommended by docs) AC_SEARCH_LIBS, because that adds -# things to LIBS, which we don’t want because it’s applied to all executables. -CH_RUN_LIBS= - -# asprintf(3) -# -# You can do this with AC_CHECK_FUNC or AC_CHECK_FUNCS, but those macros call -# the function with no arguments. This causes a warning for asprintf() for -# some compilers (and I have no clue why others accept it); see issue #798. -# Instead, try to build a small test program that calls asprintf() correctly. 
-AC_MSG_CHECKING([for asprintf in libc]) -AC_COMPILE_IFELSE([AC_LANG_SOURCE([[ - #define _GNU_SOURCE - #include - #include - - int main(void) - { - char *p; - if (asprintf(&p, "WEIRD AL YANKOVIC\n") >= 0) - free(p); - return 0; - } - ]])], - [AC_MSG_RESULT([yes])], - [AC_MSG_RESULT([no]) - AC_MSG_ERROR([asprintf(3) not found; please report this bug])]) +# Note: We link both ch-run and ch-checkns with all the shared libraries, +# despite the latter using much less, depending on the compiler to omit +# libraries that aren’t actually used (gcc does this) or just not caring that +# extra libraries are linked. # argp_parse(3), which is included with glibc but not other libc’s, e.g. musl. -AC_MSG_CHECKING([for argp_parse in libc]) -AC_LINK_IFELSE([AC_LANG_SOURCE([[ - #include - - int main(void) - { - argp_parse(0, 1, NULL, 0, 0, 0); - return 0; - } - ]])], - [AC_MSG_RESULT([yes])], # built-in, no further action - [AC_MSG_RESULT([no]) # try external libargp - AC_CHECK_LIB( - [argp], [argp_parse], - [CH_RUN_LIBS="-largp $CH_RUN_LIBS"], - [AC_MSG_ERROR([argp_parse(3) not found; please report this bug])])]) +# In the latter case, we need an external libargp. +AC_SEARCH_LIBS(argp_parse, argp, [], + [AC_MSG_ERROR([argp_parse(3) not found; please report this bug])]) # pthreads; needed for “ch-run --join”. AX_PTHREAD -# POSIX IPC lives in librt. -AC_CHECK_LIB([rt], [shm_open], [CH_RUN_LIBS="-lrt $CH_RUN_LIBS"], [ - AC_MSG_ERROR([shm_open(3) not found]) -]) +# POSIX IPC sometimes lives in librt. 
+AC_SEARCH_LIBS(shm_open, rt, [], + [AC_MSG_ERROR([shm_open(3) not found; please report this bug])]) # User namespaces AC_MSG_CHECKING([if in chroot]) # https://unix.stackexchange.com/a/14346 @@ -339,6 +375,9 @@ AC_RUN_IFELSE([AC_LANG_SOURCE([[ [AC_MSG_ERROR([cross-compilation not supported])]) AC_MSG_RESULT($have_userns) + +### ch-run optional ########################################################## + # overlayfs AC_DEFUN([CH_OVERLAY_C], [[ #define _GNU_SOURCE @@ -421,9 +460,6 @@ AS_IF([test $enable_impolite_checks = yes], [AC_MSG_ERROR([cross-compilation not supported])])]) AC_MSG_RESULT($have_tmpfs_xattrs) - -### ch-run optional ########################################################## - # FNM_EXTMATCH is a GNU extension to support extended globs in fnmatch(3). AC_CHECK_DECL(FNM_EXTMATCH, [have_fnm_extmatch=yes], @@ -431,6 +467,54 @@ AC_CHECK_DECL(FNM_EXTMATCH, [[#define _GNU_SOURCE #include ]]) +# libgc. Note that we don’t try to ensure the header we find matches the +# library we find. Hopefully that’s not a problem. +AS_IF([test $want_gc = yes], [ + AC_SEARCH_LIBS(GC_malloc, gc, + [have_libgc=yes + AS_IF([test -n "$lib_gc"], + [ch_ldflags="-Wl,-rpath=$lib_gc $ch_ldflags"])], + [have_libgc=no]) + AC_CHECK_HEADER([gc.h], + [have_gc_h=yes], + [have_gc_h=no]) +], [have_libgc=no + have_gc_h=no]) +# Error out if needed but not found. +AS_IF([test $have_libgc = yes && test $have_gc_h = yes], + [have_gc=yes], + [have_gc=no]) +AS_IF([test $need_gc = yes && test $have_gc = no], + [AC_MSG_ERROR([--with-gc=yes but libgc not found])]) + +# cJSON. Also do not check this header matches the library we find. 
+AS_IF([test $want_json = yes], [ + AC_SEARCH_LIBS(cJSON_ParseWithLength, cjson, + [have_libcjson=yes + AS_IF([test -n "$lib_json"], + [ch_ldflags="-Wl,-rpath=$lib_json $ch_ldflags"])], + [have_libcjson=no]) + # The include file installs by default to “$PREFIX/include/cjson/cJSON.h”, + # but --with-json-include shouldn’t require a “cjson” subdirectory and it + # seemed impossible to document that concisely anyway. Therefore, try both + # and define a macro. Double quotes support bundling it with Charliecloud. + AC_CHECK_HEADER([cJSON.h], + [have_cjson_h=yes + cjson_h='"cJSON.h"'], + [AC_CHECK_HEADER([cjson/cJSON.h], + [have_cjson_h=yes + cjson_h='"cjson/cJSON.h"'], + [cjson_h='not found' + have_cjson_h=no])]) +], [have_libcjson=no + have_cjson_h=no]) +# Error out if needed but not found. +AS_IF([test $have_libcjson = yes && test $have_cjson_h = yes], + [have_json=yes], + [have_json=no]) +AS_IF([test $need_json = yes && test $have_json = no], + [AC_MSG_ERROR([--with-json=yes but cJSON not found])]) + # Should we build seccomp? AC_MSG_CHECKING([for seccomp filter support]) AC_RUN_IFELSE([AC_LANG_SOURCE([[ @@ -501,18 +585,30 @@ AS_IF([test $want_libsquashfuse = yes], [ [AC_MSG_ERROR([need pkg-config to find libfuse3; try --with-libsquashfuse=no or see issue @%:@1844])]) AS_IF([pkg-config --exists fuse3], [ have_libfuse3=yes - CFLAGS="$CFLAGS $(pkg-config --cflags fuse3)" + ch_cflags="$ch_cflags $(pkg-config --cflags fuse3)" + # add -lfuse3 to LIBS (we already know it is available) + AC_SEARCH_LIBS(fuse_session_new, fuse3, [], + [AC_MSG_ERROR([libfuse3 found but not found; please report this bug])]) # libsquashfuse? - AC_CHECK_LIB([squashfuse_ll], [sqfs_ll_mount], - [have_libsquashfuse_ll=yes], - [have_libsquashfuse_ll=no]) - # ll.h? + AC_SEARCH_LIBS(sqfs_ll_mount, squashfuse_ll, + [have_libsquashfuse_ll=yes], + [have_libsquashfuse_ll=no]) + # ll.h? 
This check is hairy because AC_CHECK_HEADERS tries to actually + # compile a program that includes the header, but that won’t work for ll.h + # without the -I for fuse3 we got from pkg-config. We’re also advised not + # to change $CFLAGS within configure.ac [1]. I couldn’t figure out a way + # to get the -I into AC_CHECK_HEADER without changing $CFLAGS, so I just + # put it back to follow the advice as best I could. + # [1]: https://www.gnu.org/software/autoconf/manual/autoconf-2.65/html_node/Preset-Output-Variables.html + cflags_old=$CFLAGS + CFLAGS="$ch_cflags $CFLAGS" AC_CHECK_HEADER([squashfuse/ll.h], [have_ll_h=yes], [have_ll_h=no], [#define SQFS_CONFIG_H #define FUSE_USE_VERSION 32 - ]) # see comment in ch_fuse.c regarding these defines + ]) # see comment in fuse.c regarding these defines + CFLAGS=$cflags_old ], [have_libfuse3=no]) ]) @@ -524,9 +620,7 @@ AS_IF([ test $want_libsquashfuse = yes \ && test $have_ll_h = yes], [have_libsquashfuse=yes AS_IF([test -n "$lib_libsquashfuse"], - [rpath_libsquashfuse=-Wl,-rpath=$lib_libsquashfuse], - [rpath_libsquashfuse=]) - CH_RUN_LIBS="-lsquashfuse_ll -lfuse3 $rpath_libsquashfuse $CH_RUN_LIBS"], + [ch_ldflags="-Wl,-rpath=$lib_libsquashfuse $ch_ldflags"])], [have_libsquashfuse=no]) AS_IF([ test $need_libsquashfuse = yes \ && test $have_libsquashfuse = no], @@ -764,14 +858,13 @@ CH_CHECK_VERSION([WGET], [$vmin_wget], [--version | head -1 | cut -d' ' -f3]) # the output Makefile. It *does not* create a Make variable. # # 4. AC_DEFINE(foo, value, comment) #define’s the preprocessor symbol foo to -# value in config.h. (Supposedly value and comment are optional but I got -# warnings doing that.) So this is how you make configure values -# available in C code (as macros, not variables). Typically you would -# define something or not (allowing #ifdef), rather than always define to -# true or false (which would require #if). +# value in config.h. (Supposedly, value and comment are optional but I +# got warnings doing that.) 
Importantly, value is not expanded. This is +# good for either defining or not defining a C macro; you can then use +# #ifdef to gate on that macro. # -# 5. AC_DEFINE_UNQUOTES adds some extra transformations to the above. I -# didn’t quite follow. +# 5. AC_DEFINE_UNQUOTED also expands value. This is good for defining a C +# macro to the actual value of some configure variable. # # Below are all the variables we want available outside configure. @@ -783,21 +876,29 @@ AS_IF([test $enable_syslog = yes], [AC_DEFINE([ENABLE_SYSLOG], [1], [log to syslog])]) AM_CONDITIONAL([ENABLE_TEST], [test $enable_test = yes]) -AC_SUBST([CH_RUN_LIBS]) -AC_SUBST([PYTHON_SHEBANG]) -AC_SUBST([SPHINX]) +AC_SUBST(AM_CFLAGS, [$ch_cflags]) +AC_SUBST(AM_LDFLAGS, [$ch_ldflags]) +AC_SUBST(PYTHON_SHEBANG) +AC_SUBST(SPHINX) -AS_IF([test $have_overlayfs = yes], - [AC_DEFINE([HAVE_OVERLAYFS], [1], [unprivileged overlayfs])]) -AS_IF([test $have_tmpfs_xattrs = yes], - [AC_DEFINE([HAVE_TMPFS_XATTRS], [1], [tmpfs user xattrs])]) AS_IF([test $have_fnm_extmatch = yes], [AC_DEFINE([HAVE_FNM_EXTMATCH], [1], [extended globs supported])]) -AS_IF([test $have_seccomp = yes], - [AC_DEFINE([HAVE_SECCOMP], [1], [seccomp supported])]) +AS_IF([test $have_gc = yes], + [AC_DEFINE([HAVE_GC], [1], [enable garbage collection])]) +AM_CONDITIONAL([HAVE_JSON], [test $have_json = yes]) +AS_IF([test $have_json = yes], + [AC_DEFINE([HAVE_JSON], [1], [enable JSON features]) + AC_DEFINE_UNQUOTED([CJSON_H], [$cjson_h], [cJSON.h location])]) AM_CONDITIONAL([HAVE_LIBSQUASHFUSE], [test $have_libsquashfuse = yes]) AS_IF([test $have_libsquashfuse = yes], [AC_DEFINE([HAVE_LIBSQUASHFUSE], [1], [link with libsquashfuse])]) +AS_IF([test $have_overlayfs = yes], + [AC_DEFINE([HAVE_OVERLAYFS], [1], [unprivileged overlayfs])]) +AS_IF([test $have_seccomp = yes], + [AC_DEFINE([HAVE_SECCOMP], [1], [seccomp supported])]) +AM_CONDITIONAL([HAVE_SECCOMP], [test $have_seccomp = yes]) +AS_IF([test $have_tmpfs_xattrs = yes], + 
[AC_DEFINE([HAVE_TMPFS_XATTRS], [1], [tmpfs user xattrs])]) @@ -811,6 +912,20 @@ AS_IF([ test $have_userns = yes], [have_ch_run=yes], [have_ch_run=no]) +AS_IF([ test $want_gc = yes], + [note_libgc=$have_libgc + note_gc_h=$have_gc_h], + [note_libgc='not tested' + note_gc_h='not tested']) + +AS_IF([ test $want_json = yes], + [note_libcjson=$have_libcjson + AS_IF([test $have_cjson_h = yes], + [note_cjson_h="yes, $cjson_h"], + [note_cjson_h=no])], + [note_libcjson='not tested' + note_cjson_h='not tested']) + # image builders AS_IF([ test $enable_ch_image = yes \ @@ -946,10 +1061,22 @@ Building Charliecloud test suite ... ${enable_test} required: - C99 compiler ... ${CC} ${CFLAGS} - - optional: - extended glob patterns in --unset-env ... ${have_fnm_extmatch} + C99 compiler ... ${CC} + \$CFLAGS ... ${ch_cflags} + \$LDFLAGS ... ${ch_ldflags} + library args ... ${LIBS} + + extended glob patterns in --unset-env: ${have_fnm_extmatch} + + garbage collection: ${have_gc} + enabled ... ${want_gc} + libgc ... ${note_libgc} + gc.h ... ${note_gc_h} + + JSON features: ${have_json} + enabled ... ${want_json} + libcjson ... ${note_libcjson} + cJSON.h ... ${note_cjson_h} ch-run(1) internal SquashFS mounting: ${have_libsquashfuse} enabled ... 
${want_libsquashfuse} diff --git a/doc/cdi-nvidia.json b/doc/cdi-nvidia.json new file mode 100644 index 000000000..0bc419d09 --- /dev/null +++ b/doc/cdi-nvidia.json @@ -0,0 +1,36 @@ +{ + "cdiVersion": "0.5.0", + "kind": "nvidia.com/gpu", + "devices": [ { + "name": "foo", + "containerEdits": { + "deviceNodes": [ { "path": "/dev/nvidia0" }, + { "path": "/dev/dri/card0" } ], + "hooks": [ { "hookName": "createContainer", + "path": "/usr/bin/nvidia-ctk", + "args": [ "nvidia-ctk", + "hook", "create-symlinks", + "--link", "../card0::/dev/dri/by-path/pci-0000:07:00.0-card" + ] } ] } } ], + "containerEdits": { + "env": [ "NVIDIA_VISIBLE_DEVICES=void" ], + "deviceNodes": [ { "path": "/dev/nvidia-modeset" }, + { "path": "/dev/nvidiactl" } ], + "mounts": [ + { "hostPath": "/run/nvidia-fabricmanager/socket", + "containerPath": "/run/nvidia-fabricmanager/socket", + "options": [ "ro", "nosuid", "nodev", "bind", "noexec" ] }, + { "hostPath": "/usr/bin/nvidia-smi", + "containerPath": "/usr/bin/nvidia-smi", + "options": [ "ro", "nosuid", "nodev", "bind" ] }, + { "hostPath": "/usr/lib/x86_64-linux-gnu/libcuda.so.535.161.08", + "containerPath": "/usr/lib/x86_64-linux-gnu/libcuda.so.535.161.08", + "options": [ "ro", "nosuid", "nodev", "bind" ] } ], + "hooks": [ + { "hookName": "createContainer", + "path": "/usr/bin/nvidia-ctk", + "args": [ + "nvidia-ctk", + "hook", "update-ldcache", + "--folder", "/usr/lib/x86_64-linux-gnu" ] } ] } +} diff --git a/doc/ch-run.rst b/doc/ch-run.rst index 2771078e4..8622a3b4e 100644 --- a/doc/ch-run.rst +++ b/doc/ch-run.rst @@ -56,6 +56,40 @@ mounting SquashFS images with FUSE. :code:`-c`, :code:`--cd=DIR` Initial working directory in container. + :code:`--cdi-dirs=PATHS` + Colon-separated list of directories to search for CDI JSON specifications. + Default: :code:`CH_RUN_CDI_DIRS` if set, otherwise + :code:`/etc/cdi:/var/run/cdi`. 
+ + :code:`--color[=WHEN]` + Color logging output by log level when :code:`WHEN`: + + * By default, or if :code:`WHEN` is :code:`auto`, :code:`tty`, + :code:`if-tty`: use color if standard error is a TTY; otherwise, + don’t use color. + + * If :code:`WHEN` is :code:`yes`, :code:`always`, or :code:`force`; or + if :code:`--color` is specified without an argument: always use + color. + + * If :code:`WHEN` is :code:`no`, :code:`never`, or :code:`none`: never + use color. + + This uses ANSI color codes without checking any terminal databases, which + should work on all modern terminals. + + :code:`-d`, :code:`--devices` + Inject all CDI devices for which a specification is found. Implies + :code:`--write-fake`. + + :code:`--device=DEV` + Inject CDI device :code:`DEV`, either (1) a filename, if it starts with a + slash (:code:`/`) or dot (:code:`.`), e.g. :code:`/etc/cdi/nvidia.json`, + or (2) a CDI selector for a list of devices in a CDI specification file, + e.g. :code:`nvidia.com/gpu`. Specific devices may not be selected, e.g. + :code:`nvidia.com/gpu=1:0` is invalid (see below for why). Implies + :code:`--write-fake`. Can be repeated. + :code:`--env-no-expand` Don’t expand variables when using :code:`--set-env`. @@ -118,11 +152,6 @@ mounting SquashFS images with FUSE. This is intended for use by :code:`ch-image(1)` when building images; see that man page for a detailed discussion. - :code:`-t`, :code:`--private-tmp` - By default, the host’s :code:`/tmp` (or :code:`$TMPDIR` if set) is - bind-mounted at container :code:`/tmp`. If this is specified, a new - :code:`tmpfs` is mounted on the container’s :code:`/tmp` instead. - :code:`--set-env`, :code:`--set-env=FILE`, :code:`--set-env=VAR=VALUE` Set environment variables with newline-separated file (:code:`/ch/environment` within the image if not specified) or on the @@ -131,6 +160,11 @@ mounting SquashFS images with FUSE. 
:code:`--set-env0`, :code:`--set-env0=FILE`, :code:`--set-env0=VAR=VALUE` Like :code:`--set-env`, but file is null-byte separated. + :code:`-t`, :code:`--private-tmp` + By default, the host’s :code:`/tmp` (or :code:`$TMPDIR` if set) is + bind-mounted at container :code:`/tmp`. If this is specified, a new + :code:`tmpfs` is mounted on the container’s :code:`/tmp` instead. + :code:`-u`, :code:`--uid=UID` Run as user :code:`UID` within container. @@ -345,6 +379,7 @@ Caveats: * Many of the arguments given to the race losers, such as the image path and :code:`--bind`, will be ignored in favor of what was given to the winner. + .. _ch-run_overlay: Writeable overlay with :code:`--write-fake` @@ -375,39 +410,221 @@ requires kernel support. Specifically: and thus is not helpful for unprivileged containers.) +Injecting host “devices” with Container Device Interface (CDI) +============================================================== + +Overview of CDI +--------------- + +`Container Device Interface (CDI) +`_ +is an emerging `Cloud Native Computing Foundation (CNCF) +`_ standard to specify how “devices” are made available +to containers. Importantly, a CDI *device* is not a hardware gadget nor a +device file but rather a set of container modifications to be done before +invoking the user command. It’s intended to make devices (in the usual sense +of hardware gadgets) available inside containers but is quite flexible. A CDI +device can specify multiple device files, environment variables, mounts, and +more. Christopher Desiniotis gave a good talk at Container Plumbing Days 2024 +introducing CDI (`slides +`_, +`video `_). + +CDI devices are described in JSON *specification files*, which are declarative +except they provide for arbitrary hook programs. However, Charliecloud treats +them as fully declarative by interpreting hooks as a declarative statement +rather than a program to be run (brittle, but works for now). 
This +declarativeness has a significant advantage over OCI hooks, because we have a +clear description of what needs to be done rather than needing to run opaque +programs as hooks. + +Another advantage of CDI is that it’s largely orthogonal to OCI. While the +specifications have a strong OCI framing, this is largely an artifact of the +exposition style rather than a core notion. + +Here is an example spec file: + +.. literalinclude:: cdi-nvidia.json + :language: JSON + +This declares: + +#. A single CDI device called :code:`nvidia.com/gpu=foo`, comprising: + + #. Two device files to be made available in the container, + :code:`/dev/nvidia0` and :code:`/dev/dri/card0`. + + #. One symlink to create inside the container, + :code:`/dev/dri/by-path/pci-0000:07:00.0-card` → :code:`../card0`. + +#. A set of container changes to be made once regardless of which devices are + selected (this example has one, but real spec files have several), + comprising: + + #. One environment variable to set, :code:`NVIDIA_VISIBLE_DEVICES`. + + #. Two device files to be made available in the container, + :code:`/dev/nvidia-modeset` and :code:`/dev/nvidiactl`. + + #. A socket (:code:`/run/nvidia-fabricmanager/socket`), executable + (:code:`nvidia-smi`), and shared library + (:code:`libcuda.so.535.161.08`) to be bind-mounted into the + container. + + #. Run the *host* :code:`ldconfig` to update the *container* linker cache, + scanning only container directory :code:`/usr/lib/x86_64-linux-gnu`. + +Charliecloud’s CDI implementation +--------------------------------- + +Charliecloud has some differences from other container implementations in how +this spec file is interpreted, but the results (working CDI devices) should be +the same. These are: + +#. All CDI devices available to the user normally are also available in the + container. 
For example, some implementations allow + :code:`--device=nvidia.com/gpu=foo`, which puts only the GPU named + :code:`foo` in the container, but :code:`ch-run` accepts only + :code:`--device=nvidia.com/gpu` (and similarly in + :code:`CH_RUN_CDI_DEFAULT`). This is because the host :code:`/dev` is + bind-mounted into Charliecloud containers, so there is no need to deal with + individual device files. + +#. Hooks are interpreted declaratively rather than running the specified + program. This is because we have not yet encountered any hooks that are + both useful under Charliecloud and do a task that merits an external + program. See below for details on individual hooks. + +#. Only bind mounts are implemented, because unprivileged mount namespaces + can’t mount much that is meaningful, and we haven’t seen any other mount + types yet. + +#. Charliecloud minimizes the number of bind mounts to avoid bloating the + container filesystem tree. (The spec file for one of our not-that-large + systems declares 47 mounts!) We do this by bind-mounting each filesystem + represented in a host path once and then symlinking into it for the + declared bind mounts. + +Selecting devices +----------------- + +:code:`ch-run` must do two things to make CDI devices available: (1) locate +appropriate specification files and (2) select which kinds of CDI devices to +inject. We assume further that the most common use case is to inject all +available CDI devices. The design of Charliecloud’s CDI user interface follows +from these principles. + +TL;DR: The intended most common usage is simply :code:`ch-run -d` to inject +all available CDI devices, using prior configuration by users or admins. + +Available spec files are those in the colon-separated list of directories in +:code:`--cdi-dirs=DIRS` if given, otherwise in :code:`CH_RUN_CDI_DIRS`, +otherwise :code:`/etc/cdi:/var/run/cdi` as required by the standard. 
+ +The option :code:`--devices` (plural) or :code:`-d` then injects all devices +found in all spec files in these directories. + +Individual CDI device kinds can be selected with :code:`--device=DEV` +(singular), where :code:`DEV` is a device identifier. If the identifier starts +with slash (:code:`/`) or dot (:code:`.`), the identifier is a path to a JSON +CDI spec file, and all devices in that file are injected (e.g., +:code:`--device=./foo.json`). Otherwise, it is a CDI device kind with no +device name(s) (e.g., :code:`--device=nvidia.com/gpu`). The option can be +repeated to inject multiple device kinds. + +Importantly, both :code:`--device` and :code:`--devices` imply +:code:`--write-fake` (:code:`-W`) so the container image can be written. + Environment variables -===================== +--------------------- -:code:`ch-run` leaves environment variables unchanged, i.e. the host -environment is passed through unaltered, except: +Injecting a CDI device may require setting environment variables, as declared +in the spec file. These environment changes are executed in the order +that CDI command line options appear on the command line relative to other +user-specified environment options, e.g. :code:`--set-env` and +:code:`--unset-env`. See :ref:`ch-run_environment-variables` below for +details. -* by default (:code:`--home` not specified), :code:`HOME` is set to - :code:`/root`, if it exists, and :code:`/` otherwise. -* limited tweaks to avoid significant guest breakage; -* user-set variables via :code:`--set-env`; -* user-unset variables via :code:`--unset-env`; and -* set :code:`CH_RUNNING`. +Hooks +------ -This section describes these features. +Behavior summary +~~~~~~~~~~~~~~~~ -The default tweaks happen first, then :code:`--set-env` and -:code:`--unset-env` in the order specified on the command line, and then -:code:`CH_RUNNING`. The two options can be repeated arbitrarily many times, -e.g. 
to add/remove multiple variable sets or add only some variables in a -file. +Presently, CDI hooks fall into three categories for Charliecloud: -Default behavior ----------------- +#. **Known hooks that we need**, with behavior emulated internally (i.e., we do + what the hook does, adapted for Charliecloud, rather than running it). -By default, :code:`ch-run` makes the following environment variable changes: +#. **Known hooks that we don’t need**; we ignore these quietly (i.e., logged but + at a level hidden by default). + +#. **Unknown hooks.** We warn about these, because they need to be either moved + into one of the first two categories or actually run. (That is, we’re still + figuring out what’s needed for Charliecloud here.) + +The next two sections document known hooks. + +.. note:: + + `nVidia Container Toolkit + `_ + CDI hooks can be spelled `either + `_ + :code:`nvidia-ctk hook` (two words) or :code:`nvidia-ctk-hook` (one word). + We treat the two spellings the same. + +Emulated hooks +~~~~~~~~~~~~~~ + +#. :code:`nvidia-ctk-hook update-ldcache`. This updates the container’s + linker cache (i.e., :code:`/etc/ld.so.cache`), `notably using + `_ + the *host’s* :code:`ldconfig`. For now at least, we instead use the + *container’s* :code:`ldconfig`, the reasoning being that (1) the + container’s linker updating its own cache is lower-risk compatibility-wise + and (2) it seems unlikely that an image would be compatible with nVidia + libraries and have a linker cache but no :code:`ldconfig` executable. + + If the image has no :code:`ldconfig`, :code:`ch-run` exits with an error + and the container does not run. This indicates the assumption above is + false, so please report this error as a bug. + +Ignored hooks +~~~~~~~~~~~~~ + +#. :code:`nvidia-ctk-hook create-symlinks`. This creates one or more symlinks. + In our experience, the links created already exist in the host’s + :code:`/dev` or are created by :code:`ldconfig(8)`. + +#.
:code:`nvidia-ctk-hook chmod`. This changes file permissions, but in + unprivileged Charliecloud containers, the invoking user will already have + access to all appropriate files. -:code:`$CH_RUNNING` - Set to :code:`Weird Al Yankovic`. While a process can figure out that it’s - in an unprivileged container and what namespaces are active without this - hint, that can be messy, and there is no way to tell that it’s a - *Charliecloud* container specifically. This variable makes such a test - simple and well-defined. (**Note:** This variable is unaffected by - :code:`--unset-env`.) + +.. _ch-run_environment-variables: + +Environment variables +===================== + +Unlike most other implementations, :code:`ch-run`’s baseline for the container +environment is to pass through the host environment unaltered. From this +starting point, the environment is altered in this order: + +#. :code:`$HOME`, :code:`$PATH`, and :code:`$TMPDIR` are adjusted to avoid + common breakage (see below). + +#. User-specified changes are executed in the order they appear on the command + line (i.e., :code:`-d`/:code:`--devices`, :code:`--device`, + :code:`--set-env`, and :code:`--unset-env`, some of which can appear + multiple times). + +#. :code:`$CH_RUNNING` is set. + +Built-in environment changes +---------------------------- + +Prior to user changes, i.e. can be altered by the user: :code:`$HOME` If :code:`--home` is specified, then your home directory is bind-mounted @@ -418,13 +635,12 @@ By default, :code:`ch-run` makes the following environment variable changes: is unchanged.) :code:`$PATH` - Newer Linux distributions replace some root-level directories, such as - :code:`/bin`, with symlinks to their counterparts in :code:`/usr`. - - Some of these distributions (e.g., Fedora 24) have also dropped :code:`/bin` - from the default :code:`$PATH`. This is a problem when the guest OS does - *not* have a merged :code:`/usr` (e.g., Debian 8 “Jessie”). 
Thus, we add - :code:`/bin` to :code:`$PATH` if it’s not already present. + We append :code:`/bin` to :code:`$PATH` if it’s not already present. This is + because newer Linux distributions replace some root-level directories, such + as :code:`/bin`, with symlinks to their counterparts in :code:`/usr`. Some + of these distributions (e.g., Fedora 24) have also dropped :code:`/bin` from + the default :code:`$PATH`. This is a problem when the guest OS does *not* + have a merged :code:`/usr` (e.g., Debian 8 “Jessie”). Further reading: @@ -437,6 +653,15 @@ By default, :code:`ch-run` makes the following environment variable changes: made available in the guest at :code:`/tmp` unless :code:`--private-tmp` is given. +After user changes, i.e. cannot be altered by the user with :code:`ch-run`: + +:code:`$CH_RUNNING` + Set to :code:`Weird Al Yankovic`. While a process can figure out that it’s + in an unprivileged container and what namespaces are active without this + hint, that can be messy, and there is no way to tell that it’s a + *Charliecloud* container specifically. This variable makes such a test + simple and well-defined. + Setting variables with :code:`--set-env` or :code:`--set-env0` -------------------------------------------------------------- @@ -760,4 +985,6 @@ status is 1 regardless of the signal value. .. include:: ./see_also.rst .. LocalWords: mtune NEWROOT hugetlbfs UsrMerge fusermount mybox IMG HOSTPATH -.. LocalWords: noprofile norc SHLVL PWD kernelnewbies extglob +.. LocalWords: noprofile norc SHLVL PWD kernelnewbies extglob cdi AMMVs dri +.. LocalWords: Desiniotis declarativeness fabricmanager libglxserver ctk +.. LocalWords: libcuda ldcache diff --git a/doc/dev.rst b/doc/dev.rst index 4a3f53284..4c5f85236 100644 --- a/doc/dev.rst +++ b/doc/dev.rst @@ -941,6 +941,64 @@ characters. C code ------ +Memory management +~~~~~~~~~~~~~~~~~ + +*TL;DR:* Charliecloud does not free any memory. 
You can enable garbage +collection with :code:`libgc` if you want, and this is the default, but it may +not be necessary, i.e. simply leaking all allocated memory could still be +smaller than the overhead of trying to clean up. + +*How-To:* (1) Use Charliecloud wrappers for all library functions that +allocate memory, e.g. :code:`ch_malloc()` instead of :code:`malloc(3)`. +Importantly, this includes things like :code:`strdup(3)` and +:code:`asprintf(3)`. (2) Don’t call :code:`free(3)` or any other library +functions that free memory. + +:code:`ch-run.c` has, since `very nearly the beginning +`_, carried the notice +that it “does not bother to free memory allocations, since they are modest and +the program is short-lived”. Explicit memory management is difficult and +time-consuming, and it didn’t seem worth the effort. + +Eventually, we grew a `long-running process +`_ to serve a +SquashFUSE filesystem, and the short-lived justification became obsolete. The +rough goal became: convert to proper memory management, freeing everything +that we allocated. Various :code:`free(3)` crept in here and there, but a full +refactor was never a priority. + +Then `PR #1919 `_ came to be +and grew in scope until it was a significant refactor. We tried to Do It Right +on memory management everywhere this PR touched, and we did, until Reid got +fed up writing comments about whose problem it was to free this or that and +copying data simply so those comments could be tractable. + +So now we’re back full circle. Memory management is not worth Charliecloud +developers’ time. We gleefully :code:`malloc(3)` and :code:`realloc(3)` +without a care in the world, sinning every time. But now you have options. You +can either: + +1. YOLO, i.e. simply never free anything, i.e. leak like a sieve. But + Charliecloud is still a small program and it’s unlikely to be an actual + problem. 
Our quick-and-dirty tests with a small “hello world” Alpine image + running :code:`true(1)` show a main :code:`ch-run` process using 350 KiB + just before it executes the user program, and the SquashFUSE process the + same just before forking and 1,600 KiB upon exit. + +2. Link with :code:`libgc`, i.e. the `Boehm-Demers-Weiser + `_ conservative garbage collector. The idea is + that garbage collection scans the stack, heap, and other pointer sources + for integers that *look* like pointers and assumes they *are* pointers. + Apparently it `works quite well `_ and + can even be faster than explicit memory management in some cases. The + quick-and-dirty tests show 900 KiB by the main process, and the SquashFUSE + process the same just before forking (after an explicit garbage collection) + and 2,200 KiB upon exit. + +:code:`ch-run` logs memory usage to syslog, and also stderr with :code:`-vv`, +so you can analyze your specific situation. + :code:`const` ~~~~~~~~~~~~~ @@ -1054,19 +1112,19 @@ computed, but it’s all in raw hex and hard to interpret, e.g.:: $ ch-run --seccomp -vv alpine:3.17 -- true [...] - ch-run[62763]: seccomp: arch c00000b7: found 13 syscalls (ch_core.c:582) - ch-run[62763]: seccomp: arch 40000028: found 27 syscalls (ch_core.c:582) + ch-run[62763]: seccomp: arch c00000b7: found 13 syscalls (core.c:582) + ch-run[62763]: seccomp: arch 40000028: found 27 syscalls (core.c:582) [...] 
- ch-run[62763]: seccomp(2) program has 156 instructions (ch_core.c:591) - ch-run[62763]: 0: { op=20 k= 4 jt= 0 jf= 0 } (ch_core.c:423) - ch-run[62763]: 1: { op=15 k=c00000b7 jt= 0 jf= 17 } (ch_core.c:423) - ch-run[62763]: 2: { op=20 k= 0 jt= 0 jf= 0 } (ch_core.c:423) - ch-run[62763]: 3: { op=15 k= 5b jt=145 jf= 0 } (ch_core.c:423) + ch-run[62763]: seccomp(2) program has 156 instructions (core.c:591) + ch-run[62763]: 0: { op=20 k= 4 jt= 0 jf= 0 } (core.c:423) + ch-run[62763]: 1: { op=15 k=c00000b7 jt= 0 jf= 17 } (core.c:423) + ch-run[62763]: 2: { op=20 k= 0 jt= 0 jf= 0 } (core.c:423) + ch-run[62763]: 3: { op=15 k= 5b jt=145 jf= 0 } (core.c:423) [...] - ch-run[62763]: 154: { op= 6 k=7fff0000 jt= 0 jf= 0 } (ch_core.c:423) - ch-run[62763]: 155: { op= 6 k= 50000 jt= 0 jf= 0 } (ch_core.c:423) - ch-run[62763]: note: see FAQ to disassemble the above (ch_core.c:676) - ch-run[62763]: executing: true (ch_core.c:538) + ch-run[62763]: 154: { op= 6 k=7fff0000 jt= 0 jf= 0 } (core.c:423) + ch-run[62763]: 155: { op= 6 k= 50000 jt= 0 jf= 0 } (core.c:423) + ch-run[62763]: note: see FAQ to disassemble the above (core.c:676) + ch-run[62763]: executing: true (core.c:538) You can instead use `seccomp-tools `_ to disassemble and pretty-print diff --git a/doc/faq.rst b/doc/faq.rst index 83ba73e8e..c30c29101 100644 --- a/doc/faq.rst +++ b/doc/faq.rst @@ -201,7 +201,7 @@ handling. 
For example:: $ ch-run /var/tmp/hello -- /bin/echo foo - ch-run[154334]: error: can’t execve(2): /bin/echo: Permission denied (ch_core.c:387 13) + ch-run[154334]: error: can’t execve(2): /bin/echo: Permission denied (core.c:387 13) But :code:`/bin/echo` *does* have execute permission:: diff --git a/misc/gdb-backtrace b/misc/gdb-backtrace new file mode 100755 index 000000000..faddbb1a1 --- /dev/null +++ b/misc/gdb-backtrace @@ -0,0 +1,22 @@ +#!/bin/bash + +# Print a full gdb backtrace for a core dump, using gdb in batch mode. +# $1 executable +# $2 core dump, or directory containing core dumps, in which case pick newest + +bin=$1 +core=$2 + +if [[ -d $core ]]; then + # kludge but good enough for now (https://stackoverflow.com/q/1015678) + printf '%s is a directory\n' "$core" 1>&2 + core=$core/$(ls -At "$core" | head -1) + printf 'using %s\n' "$core" 1>&2 +fi + +gdb -batch "$bin" "$core" \ + -ex 'set style enabled on' \ + -ex 'set print pretty on' \ + -ex 'set print frame-info source-and-location' \ + -ex 'echo \n\n' \ + -ex 'backtrace -full'