Skip to content

Commit f2bac64

Browse files
rgushchin authored and Kernel Patches Daemon committed
bpf: selftests: PSI struct ops test
Add a PSI struct ops test. The test creates a cgroup with two child sub-cgroups, sets up memory.high for one of them and places a memory-hungry process (initially frozen) there. The memory-hungry task creates high memory pressure in one memory cgroup, which triggers a PSI event. The PSI BPF handler declares a memcg OOM in the corresponding cgroup. Signed-off-by: Roman Gushchin <[email protected]>
1 parent 39e55f0 commit f2bac64

File tree

2 files changed

+315
-0
lines changed

2 files changed

+315
-0
lines changed
Lines changed: 225 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,225 @@
1+
// SPDX-License-Identifier: GPL-2.0-only
2+
#include <test_progs.h>
3+
#include <bpf/btf.h>
4+
#include <bpf/bpf.h>
5+
6+
#include "cgroup_helpers.h"
7+
#include "test_psi.skel.h"
8+
9+
/*
 * PSI resource types, mirroring the kernel's enum psi_res.
 * Redeclared locally because the kernel enum is not exported
 * through the selftest headers — NOTE(review): keep in sync
 * with include/linux/psi_types.h.
 */
enum psi_res {
	PSI_IO,
	PSI_MEM,
	PSI_CPU,
	PSI_IRQ,
	NR_PSI_RESOURCES,
};
16+
17+
/* Describes one cgroup participating in the test. */
struct cgroup_desc {
	const char *path;	/* cgroupfs path of this cgroup */
	unsigned long long id;	/* cgroup id, filled in by setup_environment() */
	int pid;		/* pid of the memory-hungry child task, 0 if none */
	int fd;			/* cgroup directory fd from create_and_get_cgroup() */
	size_t target;		/* bytes the child task allocates and touches */
	size_t high;		/* memory.high limit to set, 0 = no limit */
	bool victim;		/* true if this cgroup's task is the expected OOM victim */
};

#define MB (1024 * 1024)

/*
 * Test hierarchy: a (frozen) parent with two children. cg2 gets a
 * memory.high limit far below its task's allocation target, so it
 * generates memory pressure and its task is the expected OOM victim.
 */
static struct cgroup_desc cgroups[] = {
	{ .path = "/psi_test" },
	{ .path = "/psi_test/cg1" },
	{ .path = "/psi_test/cg2", .target = 500 * MB,
	  .high = 40 * MB, .victim = true },
};
35+
36+
static int spawn_task(struct cgroup_desc *desc)
37+
{
38+
char *ptr;
39+
int pid;
40+
41+
pid = fork();
42+
if (pid < 0)
43+
return pid;
44+
45+
if (pid > 0) {
46+
/* parent */
47+
desc->pid = pid;
48+
return 0;
49+
}
50+
51+
/* child */
52+
ptr = (char *)malloc(desc->target);
53+
if (!ptr)
54+
_exit(ENOMEM);
55+
56+
memset(ptr, 'a', desc->target);
57+
58+
while (1)
59+
sleep(1000);
60+
61+
return 0;
62+
}
63+
64+
static void setup_environment(void)
65+
{
66+
int i, err;
67+
68+
err = setup_cgroup_environment();
69+
if (!ASSERT_OK(err, "setup_cgroup_environment"))
70+
goto cleanup;
71+
72+
for (i = 0; i < ARRAY_SIZE(cgroups); i++) {
73+
cgroups[i].fd = create_and_get_cgroup(cgroups[i].path);
74+
if (!ASSERT_GE(cgroups[i].fd, 0, "create_and_get_cgroup"))
75+
goto cleanup;
76+
77+
cgroups[i].id = get_cgroup_id(cgroups[i].path);
78+
if (!ASSERT_GT(cgroups[i].id, 0, "get_cgroup_id"))
79+
goto cleanup;
80+
81+
/* Freeze the top-level cgroup and enable the memory controller */
82+
if (i == 0) {
83+
err = write_cgroup_file(cgroups[i].path, "cgroup.freeze", "1");
84+
if (!ASSERT_OK(err, "freeze cgroup"))
85+
goto cleanup;
86+
87+
err = write_cgroup_file(cgroups[i].path, "cgroup.subtree_control",
88+
"+memory");
89+
if (!ASSERT_OK(err, "enable memory controller"))
90+
goto cleanup;
91+
}
92+
93+
/* Set memory.high */
94+
if (cgroups[i].high) {
95+
char buf[256];
96+
97+
snprintf(buf, sizeof(buf), "%lu", cgroups[i].high);
98+
err = write_cgroup_file(cgroups[i].path, "memory.high", buf);
99+
if (!ASSERT_OK(err, "set memory.high"))
100+
goto cleanup;
101+
102+
snprintf(buf, sizeof(buf), "0");
103+
write_cgroup_file(cgroups[i].path, "memory.swap.max", buf);
104+
}
105+
106+
/* Spawn tasks creating memory pressure */
107+
if (cgroups[i].target) {
108+
char buf[256];
109+
110+
err = spawn_task(&cgroups[i]);
111+
if (!ASSERT_OK(err, "spawn task"))
112+
goto cleanup;
113+
114+
snprintf(buf, sizeof(buf), "%d", cgroups[i].pid);
115+
err = write_cgroup_file(cgroups[i].path, "cgroup.procs", buf);
116+
if (!ASSERT_OK(err, "put child into a cgroup"))
117+
goto cleanup;
118+
}
119+
}
120+
121+
return;
122+
123+
cleanup:
124+
cleanup_cgroup_environment();
125+
}
126+
127+
/*
 * Unfreeze the top-level cgroup (letting the memory-hungry tasks run),
 * then wait for children to die and verify that the FIRST one killed
 * belongs to the .victim cgroup and that its memcg recorded exactly
 * one oom_kill event.
 *
 * Returns 0 on success: note that on the success path ret keeps the 0
 * returned by the unfreeze write and is only set to -1 on a failed
 * check. Returns -1 on any failure.
 */
static int run_and_wait_for_oom(void)
{
	int ret = -1;
	bool first = true;
	char buf[4096] = {};
	ssize_t size;

	/* Unfreeze the top-level cgroup */
	ret = write_cgroup_file(cgroups[0].path, "cgroup.freeze", "0");
	if (!ASSERT_OK(ret, "unfreeze cgroup"))
		return -1;

	/* Reap children until none remain (wait() fails with ECHILD) */
	for (;;) {
		int i, status;
		pid_t pid = wait(&status);

		if (pid == -1) {
			if (errno == EINTR)
				continue;
			/* ECHILD */
			break;
		}

		/* Only the first terminated child is checked */
		if (!first)
			continue;
		first = false;

		/* Check which process was terminated first */
		for (i = 0; i < ARRAY_SIZE(cgroups); i++) {
			/* victim must match (pid == .pid): XOR via != on bools */
			if (!ASSERT_OK(cgroups[i].victim !=
				       (pid == cgroups[i].pid),
				       "correct process was killed")) {
				ret = -1;
				break;
			}

			if (!cgroups[i].victim)
				continue;

			/* Check the memcg oom counter */
			size = read_cgroup_file(cgroups[i].path, "memory.events",
						buf, sizeof(buf));
			if (!ASSERT_OK(size <= 0, "read memory.events")) {
				ret = -1;
				break;
			}

			if (!ASSERT_OK(strstr(buf, "oom_kill 1") == NULL,
				       "oom_kill count check")) {
				ret = -1;
				break;
			}
		}

		/* Kill all remaining tasks */
		for (i = 0; i < ARRAY_SIZE(cgroups); i++)
			if (cgroups[i].pid && cgroups[i].pid != pid)
				kill(cgroups[i].pid, SIGKILL);
	}

	return ret;
}
189+
190+
void test_psi(void)
191+
{
192+
struct test_psi *skel;
193+
int cgroup_fd;
194+
int err;
195+
196+
setup_environment();
197+
198+
skel = test_psi__open_and_load();
199+
if (!ASSERT_OK_PTR(skel, "open_and_load"))
200+
goto cleanup;
201+
202+
skel->bss->high_pressure_cgroup_id = cgroups[2].id;
203+
skel->bss->my_pid = getpid();
204+
205+
err = test_psi__attach(skel);
206+
if (CHECK_FAIL(err))
207+
goto cleanup;
208+
209+
/* Delete the first cgroup, it used to trigger offline handler */
210+
remove_cgroup(cgroups[1].path);
211+
212+
/* Create new cgroup */
213+
cgroup_fd = create_and_get_cgroup("/psi_test_new");
214+
if (!ASSERT_GT(cgroup_fd, 0, "create_and_get_cgroup"))
215+
goto cleanup;
216+
217+
/* Unfreeze all child tasks and create the memory pressure */
218+
err = run_and_wait_for_oom();
219+
CHECK_FAIL(err);
220+
221+
close(cgroup_fd);
222+
cleanup:
223+
cleanup_cgroup_environment();
224+
test_psi__destroy(skel);
225+
}
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
#include "vmlinux.h"
2+
#include "bpf_experimental.h"
3+
#include <bpf/bpf_helpers.h>
4+
#include <bpf/bpf_tracing.h>
5+
#include <bpf/bpf_core_read.h>
6+
7+
char _license[] SEC("license") = "GPL";

/* cgroup which will experience the high memory pressure */
u64 high_pressure_cgroup_id;
/* pid of the user-space test process; target of the task-work signal */
u32 my_pid = 0;

/* last total full memory pressure value */
u64 last_mem_full_total = 0;

extern struct task_struct *bpf_task_from_pid(s32 pid) __ksym;
extern void bpf_task_release(struct task_struct *p) __ksym;

/* Map value: storage for the deferred-work context */
struct elem {
	struct bpf_task_work tw;
};

/* Single-slot array backing the bpf_task_work used to declare the OOM */
struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, int);
	__type(value, struct elem);
} tw_map SEC(".maps");
29+
30+
static int psi_oom_work(struct bpf_map *map, void *key, void *value)
31+
{
32+
struct cgroup *cgrp;
33+
struct mem_cgroup *memcg;
34+
35+
cgrp = bpf_cgroup_from_id(high_pressure_cgroup_id);
36+
if (!cgrp)
37+
return 0;
38+
39+
memcg = bpf_get_mem_cgroup(&cgrp->self);
40+
if (memcg) {
41+
bpf_out_of_memory(memcg, 0, BPF_OOM_FLAGS_WAIT_ON_OOM_LOCK);
42+
bpf_put_mem_cgroup(memcg);
43+
}
44+
45+
bpf_cgroup_release(cgrp);
46+
return 0;
47+
}
48+
49+
static void schedule_oom_work(void)
50+
{
51+
struct task_struct *task;
52+
struct elem *val;
53+
int key = 0;
54+
55+
task = bpf_task_from_pid(my_pid);
56+
if (task) {
57+
val = bpf_map_lookup_elem(&tw_map, &key);
58+
if (val)
59+
bpf_task_work_schedule_signal(task, &val->tw,
60+
&tw_map, psi_oom_work);
61+
bpf_task_release(task);
62+
}
63+
}
64+
65+
/*
 * Fires on every PSI averages update; watches the target cgroup's
 * full-stall memory pressure total and schedules an OOM declaration
 * once its growth between updates exceeds 50ms.
 */
SEC("tp_btf/psi_avgs_work")
int BPF_PROG(psi_avgs, struct psi_group *group)
{
	u64 total, delta;

	/* Monitor only a single target cgroup */
	if (group->cgroup_id != high_pressure_cgroup_id)
		return 0;

	total = BPF_CORE_READ(group, total[PSI_MEM_FULL]);

	/* First sample: just record the baseline */
	if (!last_mem_full_total) {
		last_mem_full_total = total;
		return 0;
	}

	delta = total - last_mem_full_total;
	last_mem_full_total = total;

	/* Declare an OOM if growth > 50ms (in ns) within the update period */
	if (delta > 50000000)
		schedule_oom_work();

	return 0;
}

0 commit comments

Comments
 (0)