Skip to content

Commit c748a25

Browse files
author
Alexei Starovoitov
committed
Merge branch 'selftests-bpf-fix-for-bpf_signal-stalls-watchdog-for-test_progs'
Eduard Zingerman says: ==================== selftests/bpf: fix for bpf_signal stalls, watchdog for test_progs Test case 'bpf_signal' had been recently reported to stall, both on the mailing list [1] and CI [2]. The stall is caused by CPU cycles perf event not being delivered within expected time frame, before test process enters system call and waits indefinitely. This patch-set addresses the issue in several ways: - A watchdog timer is added to test_progs.c runner: - it prints current sub-test name to stderr if sub-test takes longer than 10 seconds to finish; - it terminates process executing sub-test if sub-test takes longer than 120 seconds to finish. - The test case is updated to await perf event notification with a timeout and a few retries, this serves two purposes: - busy loops longer to increase the time frame for CPU cycles event generation/delivery; - makes a timeout, not stall, a worst case scenario. - The test case is updated to lower frequency of perf events, as high frequency of such events caused events generation throttling, which in turn delayed events delivery by amount of time sufficient to cause test case failure. Note: librt pthread-based timer API is used to implement watchdog timer. I chose this API over SIGALRM because signal handler execution within test process context was sufficient to trigger perf event delivery for send_signal/send_signal_nmi_thread_remote test case, w/o any additional changes. Thus I concluded that SIGALRM based implementation interferes with tests execution. [1] https://lore.kernel.org/bpf/CAP01T75OUeE8E-Lw9df84dm8ag2YmHW619f1DmPSVZ5_O89+Bg@mail.gmail.com/ [2] https://github.com/kernel-patches/bpf/actions/runs/11791485271/job/32843996871 ==================== Link: https://lore.kernel.org/r/[email protected] Signed-off-by: Alexei Starovoitov <[email protected]>
2 parents 47e2c45 + 4edab4c commit c748a25

File tree

8 files changed

+166
-20
lines changed

8 files changed

+166
-20
lines changed

tools/testing/selftests/bpf/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -742,6 +742,7 @@ TRUNNER_EXTRA_SOURCES := test_progs.c \
742742
unpriv_helpers.c \
743743
netlink_helpers.c \
744744
jit_disasm_helpers.c \
745+
io_helpers.c \
745746
test_loader.c \
746747
xsk.c \
747748
disasm.c \
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
// SPDX-License-Identifier: GPL-2.0
2+
#include <sys/select.h>
3+
#include <unistd.h>
4+
#include <errno.h>
5+
6+
int read_with_timeout(int fd, char *buf, size_t count, long usec)
7+
{
8+
const long M = 1000 * 1000;
9+
struct timeval tv = { usec / M, usec % M };
10+
fd_set fds;
11+
int err;
12+
13+
FD_ZERO(&fds);
14+
FD_SET(fd, &fds);
15+
err = select(fd + 1, &fds, NULL, NULL, &tv);
16+
if (err < 0)
17+
return err;
18+
if (FD_ISSET(fd, &fds))
19+
return read(fd, buf, count);
20+
return -EAGAIN;
21+
}
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
// SPDX-License-Identifier: GPL-2.0
2+
#include <unistd.h>
3+
4+
/* As a regular read(2), but allows to specify a timeout in micro-seconds.
5+
* Returns -EAGAIN on timeout.
6+
*/
7+
int read_with_timeout(int fd, char *buf, size_t count, long usec);

tools/testing/selftests/bpf/prog_tests/bpf_iter.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -265,10 +265,10 @@ static void *run_test_task_tid(void *arg)
265265

266266
linfo.task.tid = 0;
267267
linfo.task.pid = getpid();
268-
/* This includes the parent thread, this thread,
268+
/* This includes the parent thread, this thread, watchdog timer thread
269269
* and the do_nothing_wait thread
270270
*/
271-
test_task_common(&opts, 2, 1);
271+
test_task_common(&opts, 3, 1);
272272

273273
test_task_common_nocheck(NULL, &num_unknown_tid, &num_known_tid);
274274
ASSERT_GT(num_unknown_tid, 2, "check_num_unknown_tid");
@@ -297,7 +297,7 @@ static void test_task_pid(void)
297297
opts.link_info = &linfo;
298298
opts.link_info_len = sizeof(linfo);
299299

300-
test_task_common(&opts, 1, 1);
300+
test_task_common(&opts, 2, 1);
301301
}
302302

303303
static void test_task_pidfd(void)
@@ -315,7 +315,7 @@ static void test_task_pidfd(void)
315315
opts.link_info = &linfo;
316316
opts.link_info_len = sizeof(linfo);
317317

318-
test_task_common(&opts, 1, 1);
318+
test_task_common(&opts, 2, 1);
319319

320320
close(pidfd);
321321
}

tools/testing/selftests/bpf/prog_tests/iters.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -192,8 +192,8 @@ static void subtest_task_iters(void)
192192
syscall(SYS_getpgid);
193193
iters_task__detach(skel);
194194
ASSERT_EQ(skel->bss->procs_cnt, 1, "procs_cnt");
195-
ASSERT_EQ(skel->bss->threads_cnt, thread_num + 1, "threads_cnt");
196-
ASSERT_EQ(skel->bss->proc_threads_cnt, thread_num + 1, "proc_threads_cnt");
195+
ASSERT_EQ(skel->bss->threads_cnt, thread_num + 2, "threads_cnt");
196+
ASSERT_EQ(skel->bss->proc_threads_cnt, thread_num + 2, "proc_threads_cnt");
197197
ASSERT_EQ(skel->bss->invalid_cnt, 0, "invalid_cnt");
198198
pthread_mutex_unlock(&do_nothing_mutex);
199199
for (int i = 0; i < thread_num; i++)

tools/testing/selftests/bpf/prog_tests/send_signal.c

Lines changed: 21 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
#include <sys/time.h>
44
#include <sys/resource.h>
55
#include "test_send_signal_kern.skel.h"
6+
#include "io_helpers.h"
67

78
static int sigusr1_received;
89

@@ -24,6 +25,7 @@ static void test_send_signal_common(struct perf_event_attr *attr,
2425
int pipe_c2p[2], pipe_p2c[2];
2526
int err = -1, pmu_fd = -1;
2627
volatile int j = 0;
28+
int retry_count;
2729
char buf[256];
2830
pid_t pid;
2931
int old_prio;
@@ -163,21 +165,25 @@ static void test_send_signal_common(struct perf_event_attr *attr,
163165
/* notify child that bpf program can send_signal now */
164166
ASSERT_EQ(write(pipe_p2c[1], buf, 1), 1, "pipe_write");
165167

166-
/* For the remote test, the BPF program is triggered from this
167-
* process but the other process/thread is signaled.
168-
*/
169-
if (remote) {
170-
if (!attr) {
171-
for (int i = 0; i < 10; i++)
172-
usleep(1);
173-
} else {
174-
for (int i = 0; i < 100000000; i++)
175-
j /= i + 1;
168+
for (retry_count = 0;;) {
169+
/* For the remote test, the BPF program is triggered from this
170+
* process but the other process/thread is signaled.
171+
*/
172+
if (remote) {
173+
if (!attr) {
174+
for (int i = 0; i < 10; i++)
175+
usleep(1);
176+
} else {
177+
for (int i = 0; i < 100000000; i++)
178+
j /= i + 1;
179+
}
176180
}
181+
/* wait for result */
182+
err = read_with_timeout(pipe_c2p[0], buf, 1, 100);
183+
if (err == -EAGAIN && retry_count++ < 10000)
184+
continue;
185+
break;
177186
}
178-
179-
/* wait for result */
180-
err = read(pipe_c2p[0], buf, 1);
181187
if (!ASSERT_GE(err, 0, "reading pipe"))
182188
goto disable_pmu;
183189
if (!ASSERT_GT(err, 0, "reading pipe error: size 0")) {
@@ -223,7 +229,8 @@ static void test_send_signal_perf(bool signal_thread, bool remote)
223229
static void test_send_signal_nmi(bool signal_thread, bool remote)
224230
{
225231
struct perf_event_attr attr = {
226-
.sample_period = 1,
232+
.freq = 1,
233+
.sample_freq = 1000,
227234
.type = PERF_TYPE_HARDWARE,
228235
.config = PERF_COUNT_HW_CPU_CYCLES,
229236
};

tools/testing/selftests/bpf/test_progs.c

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
#include <sys/socket.h>
1717
#include <sys/un.h>
1818
#include <bpf/btf.h>
19+
#include <time.h>
1920
#include "json_writer.h"
2021

2122
#include "network_helpers.h"
@@ -179,6 +180,88 @@ int usleep(useconds_t usec)
179180
return syscall(__NR_nanosleep, &ts, NULL);
180181
}
181182

183+
/* Watchdog timer is started by watchdog_start() and stopped by watchdog_stop().
184+
* If timer is active for longer than env.secs_till_notify,
185+
* it prints the name of the current test to the stderr.
186+
* If timer is active for longer than env.secs_till_kill,
187+
* it kills the thread executing the test by sending a SIGSEGV signal to it.
188+
*/
189+
static void watchdog_timer_func(union sigval sigval)
190+
{
191+
struct itimerspec timeout = {};
192+
char test_name[256];
193+
int err;
194+
195+
if (env.subtest_state)
196+
snprintf(test_name, sizeof(test_name), "%s/%s",
197+
env.test->test_name, env.subtest_state->name);
198+
else
199+
snprintf(test_name, sizeof(test_name), "%s",
200+
env.test->test_name);
201+
202+
switch (env.watchdog_state) {
203+
case WD_NOTIFY:
204+
fprintf(env.stderr_saved, "WATCHDOG: test case %s executes for %d seconds...\n",
205+
test_name, env.secs_till_notify);
206+
timeout.it_value.tv_sec = env.secs_till_kill - env.secs_till_notify;
207+
env.watchdog_state = WD_KILL;
208+
err = timer_settime(env.watchdog, 0, &timeout, NULL);
209+
if (err)
210+
fprintf(env.stderr_saved, "Failed to arm watchdog timer\n");
211+
break;
212+
case WD_KILL:
213+
fprintf(env.stderr_saved,
214+
"WATCHDOG: test case %s executes for %d seconds, terminating with SIGSEGV\n",
215+
test_name, env.secs_till_kill);
216+
pthread_kill(env.main_thread, SIGSEGV);
217+
break;
218+
}
219+
}
220+
221+
static void watchdog_start(void)
222+
{
223+
struct itimerspec timeout = {};
224+
int err;
225+
226+
if (env.secs_till_kill == 0)
227+
return;
228+
if (env.secs_till_notify > 0) {
229+
env.watchdog_state = WD_NOTIFY;
230+
timeout.it_value.tv_sec = env.secs_till_notify;
231+
} else {
232+
env.watchdog_state = WD_KILL;
233+
timeout.it_value.tv_sec = env.secs_till_kill;
234+
}
235+
err = timer_settime(env.watchdog, 0, &timeout, NULL);
236+
if (err)
237+
fprintf(env.stderr_saved, "Failed to start watchdog timer\n");
238+
}
239+
240+
static void watchdog_stop(void)
241+
{
242+
struct itimerspec timeout = {};
243+
int err;
244+
245+
env.watchdog_state = WD_NOTIFY;
246+
err = timer_settime(env.watchdog, 0, &timeout, NULL);
247+
if (err)
248+
fprintf(env.stderr_saved, "Failed to stop watchdog timer\n");
249+
}
250+
251+
static void watchdog_init(void)
252+
{
253+
struct sigevent watchdog_sev = {
254+
.sigev_notify = SIGEV_THREAD,
255+
.sigev_notify_function = watchdog_timer_func,
256+
};
257+
int err;
258+
259+
env.main_thread = pthread_self();
260+
err = timer_create(CLOCK_MONOTONIC, &watchdog_sev, &env.watchdog);
261+
if (err)
262+
fprintf(stderr, "Failed to initialize watchdog timer\n");
263+
}
264+
182265
static bool should_run(struct test_selector *sel, int num, const char *name)
183266
{
184267
int i;
@@ -515,6 +598,7 @@ bool test__start_subtest(const char *subtest_name)
515598

516599
env.subtest_state = subtest_state;
517600
stdio_hijack_init(&subtest_state->log_buf, &subtest_state->log_cnt);
601+
watchdog_start();
518602

519603
return true;
520604
}
@@ -780,6 +864,7 @@ enum ARG_KEYS {
780864
ARG_DEBUG = -1,
781865
ARG_JSON_SUMMARY = 'J',
782866
ARG_TRAFFIC_MONITOR = 'm',
867+
ARG_WATCHDOG_TIMEOUT = 'w',
783868
};
784869

785870
static const struct argp_option opts[] = {
@@ -810,6 +895,8 @@ static const struct argp_option opts[] = {
810895
{ "traffic-monitor", ARG_TRAFFIC_MONITOR, "NAMES", 0,
811896
"Monitor network traffic of tests with name matching the pattern (supports '*' wildcard)." },
812897
#endif
898+
{ "watchdog-timeout", ARG_WATCHDOG_TIMEOUT, "SECONDS", 0,
899+
"Kill the process if tests are not making progress for specified number of seconds." },
813900
{},
814901
};
815902

@@ -1035,6 +1122,16 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state)
10351122
true);
10361123
break;
10371124
#endif
1125+
case ARG_WATCHDOG_TIMEOUT:
1126+
env->secs_till_kill = atoi(arg);
1127+
if (env->secs_till_kill < 0) {
1128+
fprintf(stderr, "Invalid watchdog timeout: %s.\n", arg);
1129+
return -EINVAL;
1130+
}
1131+
if (env->secs_till_kill < env->secs_till_notify) {
1132+
env->secs_till_notify = 0;
1133+
}
1134+
break;
10381135
default:
10391136
return ARGP_ERR_UNKNOWN;
10401137
}
@@ -1263,10 +1360,12 @@ static void run_one_test(int test_num)
12631360

12641361
stdio_hijack(&state->log_buf, &state->log_cnt);
12651362

1363+
watchdog_start();
12661364
if (test->run_test)
12671365
test->run_test();
12681366
else if (test->run_serial_test)
12691367
test->run_serial_test();
1368+
watchdog_stop();
12701369

12711370
/* ensure last sub-test is finalized properly */
12721371
if (env.subtest_state)
@@ -1707,6 +1806,7 @@ static int worker_main_send_subtests(int sock, struct test_state *state)
17071806
static int worker_main(int sock)
17081807
{
17091808
save_netns();
1809+
watchdog_init();
17101810

17111811
while (true) {
17121812
/* receive command */
@@ -1816,6 +1916,8 @@ int main(int argc, char **argv)
18161916

18171917
sigaction(SIGSEGV, &sigact, NULL);
18181918

1919+
env.secs_till_notify = 10;
1920+
env.secs_till_kill = 120;
18191921
err = argp_parse(&argp, argc, argv, 0, NULL, &env);
18201922
if (err)
18211923
return err;
@@ -1824,6 +1926,8 @@ int main(int argc, char **argv)
18241926
if (err)
18251927
return err;
18261928

1929+
watchdog_init();
1930+
18271931
/* Use libbpf 1.0 API mode */
18281932
libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
18291933
libbpf_set_print(libbpf_print_fn);

tools/testing/selftests/bpf/test_progs.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,12 @@ struct test_env {
131131
pid_t *worker_pids; /* array of worker pids */
132132
int *worker_socks; /* array of worker socks */
133133
int *worker_current_test; /* array of current running test for each worker */
134+
135+
pthread_t main_thread;
136+
int secs_till_notify;
137+
int secs_till_kill;
138+
timer_t watchdog; /* watch for stalled tests/subtests */
139+
enum { WD_NOTIFY, WD_KILL } watchdog_state;
134140
};
135141

136142
#define MAX_LOG_TRUNK_SIZE 8192

0 commit comments

Comments
 (0)