diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index b18fb5fcb38e2..03a0782c94bfd 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -125,6 +125,9 @@ int cgroup_rm_cftypes(struct cftype *cfts);
 void cgroup_file_notify(struct cgroup_file *cfile);
 void cgroup_file_show(struct cgroup_file *cfile, bool show);
 
+ssize_t cgroup_kn_interface_write(struct kernfs_node *kn, const char *name__str,
+				  const char *buf, size_t nbytes, loff_t off);
+
 int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry);
 int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
 		     struct pid *pid, struct task_struct *tsk);
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 6b4877e85a68c..5efc1bc57db90 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2605,6 +2605,55 @@ bpf_task_get_cgroup1(struct task_struct *task, int hierarchy_id)
 		return NULL;
 	return cgrp;
 }
+
+#define BPF_CGROUP_MAX_WRITE	((1UL << 24) - 1)
+
+/**
+ * bpf_cgroup_write_interface - Writes to a cgroup interface file.
+ * @cgrp: The target cgroup
+ * @name__str: name of the cgroup core interface file
+ * @value_p: dynptr holding the value to write
+ * @off: offset, passed through to the interface write handler
+ *
+ * An empty value is a no-op that returns 0. Values larger than
+ * BPF_CGROUP_MAX_WRITE bytes are rejected with -E2BIG, and a dynptr whose
+ * data cannot be obtained is rejected with -EINVAL.
+ *
+ * Return: number of bytes written on success, a negative value on error.
+ */
+__bpf_kfunc int
+bpf_cgroup_write_interface(struct cgroup *cgrp, const char *name__str,
+			   const struct bpf_dynptr *value_p, loff_t off)
+{
+	struct bpf_dynptr_kern *value_ptr = (struct bpf_dynptr_kern *)value_p;
+	struct kernfs_node *kn;
+	const void *value;
+	u32 value_len;
+	int ret;
+
+	value_len = __bpf_dynptr_size(value_ptr);
+	if (!value_len)
+		return 0;
+
+	if (value_len > BPF_CGROUP_MAX_WRITE)
+		return -E2BIG;
+
+	value = __bpf_dynptr_data(value_ptr, value_len);
+	if (!value)
+		return -EINVAL;
+
+	/* Take the reference inside the RCU section covering the kn load. */
+	rcu_read_lock();
+	kn = cgrp->kn;
+	kernfs_get(kn);
+	rcu_read_unlock();
+
+	ret = cgroup_kn_interface_write(kn, name__str, value, value_len, off);
+	kernfs_put(kn);
+
+	return ret;
+}
+
 #endif /* CONFIG_CGROUPS */
 
 /**
@@ -3736,6 +3785,7 @@ BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_task_under_cgroup, KF_RCU)
 BTF_ID_FLAGS(func, bpf_task_get_cgroup1, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_cgroup_write_interface, KF_TRUSTED_ARGS | KF_SLEEPABLE)
 #endif
 BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_task_from_vpid, KF_ACQUIRE | KF_RET_NULL)
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 312c6a8b55bb7..cddd7c1d354df 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -229,6 +229,24 @@ static struct file_system_type cgroup2_fs_type;
 static struct cftype cgroup_base_files[];
 static struct cftype cgroup_psi_files[];
 
+struct cgroup_kn_cftype {
+	char name[MAX_CFTYPE_NAME];
+	unsigned int namelen;
+
+	/*
+	 * write() is the write operation on a kernfs node.
+	 */
+	ssize_t (*write)(struct kernfs_node *kn, const char *buf, size_t nbytes,
+			 loff_t off, bool revalidate);
+};
+
+#define CGROUP_PREFIX "cgroup."
+#define CGROUP_CORE_INTERFACE_FREEZE_SUFFIX "freeze"
+#define CGROUP_CORE_INTERFACE_FREEZE (CGROUP_PREFIX CGROUP_CORE_INTERFACE_FREEZE_SUFFIX)
+#define CGROUP_CORE_INTERFACE_FREEZE_LEN (sizeof(CGROUP_CORE_INTERFACE_FREEZE) - 1)
+
+static struct cgroup_kn_cftype kn_cfts[];
+
 /* cgroup optional features */
 enum cgroup_opt_features {
 #ifdef CONFIG_PSI
@@ -4030,29 +4048,66 @@ static int cgroup_freeze_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
-static ssize_t cgroup_freeze_write(struct kernfs_open_file *of,
-				   char *buf, size_t nbytes, loff_t off)
+/* Passes only for cgroups on the default hierarchy that have a parent. */
+static bool cgroup_kn_revalidate(struct cgroup *cgrp)
+{
+	if (!cgroup_on_dfl(cgrp) || !cgroup_parent(cgrp))
+		return false;
+
+	return true;
+}
+
+/*
+ * Parse a (0|1) value, optionally followed by a newline, and freeze or thaw
+ * the cgroup behind @kn. @buf is not assumed to be NUL-terminated and is never
+ * read past @nbytes. With @revalidate set, the cgroup must additionally pass
+ * cgroup_kn_revalidate(). Returns @nbytes on success or a negative errno.
+ */
+static ssize_t cgroup_kn_freeze(struct kernfs_node *kn,
+				const char *buf, size_t nbytes, loff_t off,
+				bool revalidate)
 {
 	struct cgroup *cgrp;
 	ssize_t ret;
 	int freeze;
+	char b[4] = {0};
+	size_t len;
+
+	/* Bounded copy of at most sizeof(b) - 1 bytes, never past @nbytes. */
+	len = strnlen(buf, min(nbytes, sizeof(b)));
+	if (len >= sizeof(b))
+		return -E2BIG;
+	memcpy(b, buf, len);
 
-	ret = kstrtoint(strstrip(buf), 0, &freeze);
+	ret = kstrtoint(strstrip(b), 0, &freeze);
 	if (ret)
 		return ret;
 
 	if (freeze < 0 || freeze > 1)
 		return -ERANGE;
 
-	cgrp = cgroup_kn_lock_live(of->kn, false);
+	cgrp = cgroup_kn_lock_live(kn, false);
 	if (!cgrp)
 		return -ENOENT;
 
+	if (revalidate && !cgroup_kn_revalidate(cgrp)) {
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+
 	cgroup_freeze(cgrp, freeze);
 
-	cgroup_kn_unlock(of->kn);
+	ret = nbytes;
 
-	return nbytes;
+out:
+	cgroup_kn_unlock(kn);
+	return ret;
+}
+
+static ssize_t cgroup_freeze_write(struct kernfs_open_file *of,
+				   char *buf, size_t nbytes, loff_t off)
+{
+	return cgroup_kn_freeze(of->kn, buf, nbytes, off, false);
 }
 
 static void __cgroup_kill(struct cgroup *cgrp)
@@ -4601,6 +4656,62 @@ void cgroup_file_show(struct cgroup_file *cfile, bool show)
 	kernfs_put(kn);
 }
 
+static struct cgroup_kn_cftype kn_cfts[] = {
+	{
+		.name = CGROUP_CORE_INTERFACE_FREEZE,
+		.namelen = CGROUP_CORE_INTERFACE_FREEZE_LEN,
+		.write = cgroup_kn_freeze,
+	},
+	{ },
+};
+
+/* Look up the kn_cfts entry whose name is exactly @name__str. */
+static const struct cgroup_kn_cftype *cgroup_kn_cft(const char *name__str)
+{
+	struct cgroup_kn_cftype *kn_cft;
+
+	for (kn_cft = kn_cfts; kn_cft->name[0] != '\0'; kn_cft++) {
+		/* Exact match: a prefix compare would accept "cgroup.freezeXYZ". */
+		if (!strcmp(name__str, kn_cft->name))
+			return kn_cft;
+	}
+
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
+/**
+ * cgroup_kn_interface_write - write to a cgroup core interface by name
+ * @kn: kernfs node of the cgroup; must be a directory node
+ * @name__str: interface name, matched against the entries of kn_cfts
+ * @buf: value to write
+ * @nbytes: number of bytes in @buf
+ * @off: offset, passed through to the interface's write handler
+ *
+ * Return: the handler's return value, 0 if @nbytes is 0, -ENOTDIR if @kn is
+ * not a directory, -EOPNOTSUPP if the interface is unknown or has no handler.
+ */
+ssize_t cgroup_kn_interface_write(struct kernfs_node *kn, const char *name__str,
+				  const char *buf, size_t nbytes, loff_t off)
+{
+	const struct cgroup_kn_cftype *kn_cft;
+
+	/* Empty write is a no-op; keep this check. */
+	if (!nbytes)
+		return 0;
+
+	if (kernfs_type(kn) != KERNFS_DIR)
+		return -ENOTDIR;
+
+	kn_cft = cgroup_kn_cft(name__str);
+	if (IS_ERR(kn_cft))
+		return PTR_ERR(kn_cft);
+
+	if (unlikely(!kn_cft->write))
+		return -EOPNOTSUPP;
+
+	return kn_cft->write(kn, buf, nbytes, off, true);
+}
+
 /**
  * css_next_child - find the next child of a given css
  * @pos: the current position (%NULL to initiate traversal)
diff --git a/tools/testing/selftests/bpf/prog_tests/task_freeze_cgroup.c b/tools/testing/selftests/bpf/prog_tests/task_freeze_cgroup.c
new file mode 100644
index 0000000000000..d4e9c0f321964
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/task_freeze_cgroup.c
@@ -0,0 +1,187 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * NOTE(review): the four <...> header names below were lost from this patch
+ * text and are reconstructed from the symbols used in this file - confirm.
+ */
+#include <sys/wait.h>
+#include <unistd.h>
+#include <test_progs.h>
+#include <cgroup_helpers.h>
+#include "test_task_freeze_cgroup.skel.h"
+
+#define CGROUP_PATH "/test-task-freeze-cgroup"
+
+/* Runs in the forked child: joins CGROUP_PATH, loads the skeleton and
+ * attaches the LSM program. The first tp_newchild attach is expected to
+ * fail with EPERM, the second one to succeed. Returns 0 on success and a
+ * non-zero value on failure.
+ */
+static int bpf_sleepable(struct test_task_freeze_cgroup *skel)
+{
+	int err, cgroup_fd;
+	pid_t new_pid2;
+
+	cgroup_fd = cgroup_setup_and_join(CGROUP_PATH);
+	if (!ASSERT_OK(cgroup_fd < 0, "cgroup_setup_and_join"))
+		return -errno;
+
+	skel = test_task_freeze_cgroup__open();
+	if (!ASSERT_OK_PTR(skel, "test_task_freeze_cgroup__open")) {
+		err = -errno;
+		goto cleanup_cgroup;
+	}
+
+	skel->rodata->parent_pid = getppid();
+	skel->rodata->monitor_pid = getpid();
+	skel->rodata->cgid = get_cgroup_id(CGROUP_PATH);
+	skel->bss->new_pid = getpid();
+	skel->bss->freeze = 1;
+
+	err = test_task_freeze_cgroup__load(skel);
+	if (!ASSERT_OK(err, "test_task_freeze_cgroup__load")) {
+		err = -errno;
+		goto cleanup_skel;
+	}
+
+	/* First attach the LSM Program that is triggered on bpf() calls
+	 * especially on TP_BTF programs when attached.
+	 */
+	skel->links.lsm_freeze_cgroup =
+		bpf_program__attach_lsm(skel->progs.lsm_freeze_cgroup);
+	if (!ASSERT_OK_PTR(skel->links.lsm_freeze_cgroup, "attach_lsm")) {
+		err = -errno;
+		goto cleanup_detach;
+	}
+
+	/* Attaching this must fail with -EPERM and freeze current task */
+	skel->links.tp_newchild =
+		bpf_program__attach_trace(skel->progs.tp_newchild);
+	if (!ASSERT_EQ(errno, EPERM, "attach_trace() must fail here")) {
+		err = -EINVAL;
+		goto cleanup_detach;
+	}
+
+	/* Continue */
+
+	/* Attach again now with success */
+	skel->links.tp_newchild =
+		bpf_program__attach_trace(skel->progs.tp_newchild);
+	if (!ASSERT_OK_PTR(skel->links.tp_newchild, "attach_trace")) {
+		err = -EINVAL;
+		goto cleanup_detach;
+	}
+
+	/* Fork, update vars from BPF and assert the unfrozen state */
+	new_pid2 = fork();
+	if (new_pid2 == 0)
+		exit(0);
+
+	err = (new_pid2 == -1);
+	if (ASSERT_OK(err, "fork process"))
+		wait(NULL);
+
+	/* Now assert that new_pid2 reflects this new child */
+	ASSERT_NEQ(0, skel->bss->new_pid,
+		   "test task_freeze_cgroup failed at new_pid != 0");
+	ASSERT_NEQ(skel->rodata->monitor_pid, skel->bss->new_pid,
+		   "test task_freeze_cgroup failed at old monitor_pid != new_pid");
+	/* Assert that bpf sets new_pid to new forked child new_pid2 */
+	ASSERT_EQ(skel->bss->new_pid, new_pid2,
+		  "test task_freeze_cgroup failed first child new_pid == new_pid2");
+
+cleanup_detach:
+	test_task_freeze_cgroup__detach(skel);
+cleanup_skel:
+	test_task_freeze_cgroup__destroy(skel);
+cleanup_cgroup:
+	close(cgroup_fd);
+	cleanup_cgroup_environment();
+	return err;
+}
+
+/* Parent side: forks the child running bpf_sleepable(), polls the child's
+ * cgroup.freeze file until it reads non-zero, then attaches lsm_task_free
+ * and expects the child to exit with status 0.
+ */
+void test_task_freeze_cgroup(void)
+{
+	pid_t pid, result;
+	char buf[512] = {0};
+	char path[PATH_MAX] = {0};
+	int ret, status, attempts, frozen = 0, fd;
+	struct test_task_freeze_cgroup *skel = NULL;
+
+	pid = fork();
+	ret = (pid == -1);
+	if (!ASSERT_OK(ret, "fork process"))
+		return;
+
+	if (pid == 0) {
+		ret = bpf_sleepable(skel);
+		ASSERT_EQ(0, ret, "child bpf_sleepable failed");
+		exit(ret);
+	}
+
+	skel = test_task_freeze_cgroup__open();
+	if (!ASSERT_OK_PTR(skel, "test_task_freeze_cgroup__open"))
+		goto out;
+
+	snprintf(path, sizeof(path),
+		 "/sys/fs/cgroup/cgroup-test-work-dir%d%s/cgroup.freeze",
+		 pid, CGROUP_PATH);
+
+	for (attempts = 10; attempts >= 0; attempts--) {
+		ret = 0;
+
+		/* 0 is a valid descriptor; only negative values are errors. */
+		fd = open(path, O_RDONLY);
+		if (fd >= 0) {
+			ret = read(fd, buf, sizeof(buf) - 1);
+			close(fd);
+		}
+		if (ret > 0) {
+			errno = 0;
+			frozen = strtol(buf, NULL, 10);
+			if (errno)
+				frozen = 0;
+		}
+
+		if (frozen)
+			break;
+		sleep(1);
+	}
+
+	/* Assert that child cgroup is frozen */
+	if (!ASSERT_EQ(1, frozen, "child cgroup not frozen"))
+		goto out;
+
+	ret = test_task_freeze_cgroup__load(skel);
+	if (!ASSERT_OK(ret, "test_task_freeze_cgroup__load"))
+		goto out;
+
+	/* Attaching from the parent triggers the thaw of the child cgroup */
+	skel->links.lsm_task_free =
+		bpf_program__attach_lsm(skel->progs.lsm_task_free);
+	if (!ASSERT_OK_PTR(skel->links.lsm_task_free, "attach_lsm"))
+		goto out;
+
+	result = waitpid(pid, &status, WUNTRACED);
+	if (!ASSERT_NEQ(result, -1, "waitpid"))
+		goto detach;
+
+	result = WIFEXITED(status);
+	if (!ASSERT_EQ(result, 1, "forked process did not terminate normally"))
+		goto detach;
+
+	result = WEXITSTATUS(status);
+	if (!ASSERT_EQ(result, 0, "forked process did not exit successfully"))
+		goto detach;
+
+detach:
+	test_task_freeze_cgroup__detach(skel);
+
+out:
+	if (skel)
+		test_task_freeze_cgroup__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/progs/test_task_freeze_cgroup.c b/tools/testing/selftests/bpf/progs/test_task_freeze_cgroup.c
new file mode 100644
index 0000000000000..07b4b65abc368
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_task_freeze_cgroup.c
@@ -0,0 +1,166 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * NOTE(review): the five <...> header names below were lost from this patch
+ * text and are reconstructed from usage; the last one is a guess - confirm.
+ */
+#include <vmlinux.h>
+#include <errno.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+#include "bpf_kfuncs.h"
+#include "bpf_misc.h"
+
+struct cgroup *bpf_cgroup_from_id(u64 cgid) __ksym;
+long bpf_task_under_cgroup(struct task_struct *task, struct cgroup *ancestor) __ksym;
+void bpf_cgroup_release(struct cgroup *p) __ksym;
+struct task_struct *bpf_task_from_pid(s32 pid) __ksym;
+struct task_struct *bpf_task_acquire(struct task_struct *p) __ksym;
+void bpf_task_release(struct task_struct *p) __ksym;
+
+extern int bpf_cgroup_write_interface(struct cgroup *cgrp,
+				      const char *name__str,
+				      const struct bpf_dynptr *value_p,
+				      loff_t off) __ksym __weak;
+
+char freeze_val[] = "1";
+char unthaw_val[] = "0";
+
+const volatile int parent_pid;
+const volatile int monitor_pid;
+const volatile __u64 cgid;
+int new_pid;
+int freeze;
+
+/* Record the tgid of a task created by monitor_pid inside the test cgroup. */
+SEC("tp_btf/task_newtask")
+int BPF_PROG(tp_newchild, struct task_struct *task, u64 clone_flags)
+{
+	struct cgroup *cgrp = NULL;
+	struct task_struct *acquired;
+
+	if (monitor_pid != (bpf_get_current_pid_tgid() >> 32))
+		return 0;
+
+	acquired = bpf_task_acquire(task);
+	if (!acquired)
+		return 0;
+
+	cgrp = bpf_cgroup_from_id(cgid);
+	if (!cgrp)
+		goto out;
+
+	/* Update new_pid with the tgid of the new task */
+	if (bpf_task_under_cgroup(acquired, cgrp))
+		new_pid = acquired->tgid;
+
+out:
+	if (cgrp)
+		bpf_cgroup_release(cgrp);
+	bpf_task_release(acquired);
+
+	return 0;
+}
+
+/* Empty program: attaching it from the parent triggers the lsm/bpf hook
+ * below, which thaws the child cgroup from the parent's context.
+ */
+SEC("lsm/task_free")
+int BPF_PROG(lsm_task_free, struct task_struct *task)
+{
+	return 0;
+}
+
+/* Write "1" (freeze) or "0" (thaw) to cgroup.freeze of the test cgroup if
+ * task @pid is under it. Returns the kfunc result, 0 or -EINVAL otherwise.
+ */
+static int process_freeze_cgroup(int pid, int freeze)
+{
+	int ret = 0;
+	struct task_struct *task;
+	struct bpf_dynptr dyn_ptr;
+	struct cgroup *cgrp = NULL;
+
+	task = bpf_task_from_pid(pid);
+	if (!task)
+		return -EINVAL;
+
+	cgrp = bpf_cgroup_from_id(cgid);
+	if (!cgrp) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (!bpf_task_under_cgroup(task, cgrp))
+		goto out;
+
+	if (freeze)
+		bpf_dynptr_from_mem(freeze_val, sizeof(freeze_val), 0, &dyn_ptr);
+	else
+		bpf_dynptr_from_mem(unthaw_val, sizeof(unthaw_val), 0, &dyn_ptr);
+
+	ret = bpf_cgroup_write_interface(cgrp, "cgroup.freeze", &dyn_ptr, 0);
+
+out:
+	if (cgrp)
+		bpf_cgroup_release(cgrp);
+	bpf_task_release(task);
+	return ret;
+}
+
+/* On BPF_LINK_CREATE: the parent thaws the child's cgroup, while the first
+ * call from the child freezes its own cgroup and is denied with -EPERM.
+ */
+SEC("lsm.s/bpf")
+int BPF_PROG(lsm_freeze_cgroup, int cmd, union bpf_attr *attr, unsigned int size)
+{
+	int ret = 0;
+	struct task_struct *task;
+	struct cgroup *cgrp = NULL;
+
+	if (cmd != BPF_LINK_CREATE)
+		return 0;
+
+	task = bpf_get_current_task_btf();
+	if (parent_pid == task->pid) {
+		/* Parent context: thaw child */
+		process_freeze_cgroup(monitor_pid, 0);
+		return 0;
+	}
+
+	/* Nothing to do */
+	if (!freeze)
+		return 0;
+
+	/* Child context */
+	if (monitor_pid != task->pid)
+		return 0;
+
+	/* Ensure we are under the corresponding cgroup so we freeze
+	 * current child from its context
+	 */
+	cgrp = bpf_cgroup_from_id(cgid);
+	if (!cgrp)
+		return 0;
+
+	if (!bpf_task_under_cgroup(task, cgrp))
+		goto out;
+
+	/* Schedule freeze task and return -EPERM */
+	ret = process_freeze_cgroup(monitor_pid, freeze);
+
+	/* On error or 0 we return zero and we catch at
+	 * user space if the cgroup was not frozen.
+	 */
+	ret = (ret > 0) ? -EPERM : 0;
+
+	/* Reset for next calls */
+	freeze = 0;
+out:
+	if (cgrp)
+		bpf_cgroup_release(cgrp);
+	return ret;
+}
+
+char _license[] SEC("license") = "GPL";