Skip to content

Commit 148eb58

Browse files
committed
DEBUG: wdt: better detect apparently locked up threads and warn about them
In order to help users detect when threads are behaving abnormally, let's try to emit a warning when one is no longer making any progress. This will allow catching faulty situations more accurately, instead of occasionally triggering just after the long task. It will also let users know that there is something wrong with their configuration, and inspect the call trace to figure out whether they're using excessively long rules or Lua for example (the usual warnings about lua-load vs lua-load-per-thread are still reported). The warning will only be emitted for threads not yet marked as stuck so as not to interfere with panic dumps and avoid sending a warning just before a panic. A tainted flag (0x2000) is set when this happens, however.
1 parent 0950778 commit 148eb58

File tree

3 files changed

+13
-2
lines changed

3 files changed

+13
-2
lines changed

include/haproxy/bug.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -421,6 +421,7 @@ enum tainted_flags {
421421
TAINTED_LUA_STUCK = 0x00000400, /* stuck in a Lua context */
422422
TAINTED_LUA_STUCK_SHARED = 0x00000800, /* stuck in a shared Lua context */
423423
TAINTED_MEM_TRIMMING_STUCK = 0x00001000, /* stuck while trimming memory */
424+
TAINTED_WARN_BLOCKED_TRAFFIC = 0x00002000, /* emitted a warning about blocked traffic */
424425
};
425426

426427
/* this is a bit field made of TAINTED_*, and is declared in haproxy.c */

src/debug.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -738,7 +738,7 @@ void ha_stuck_warning(int thr)
738738
struct buffer buf;
739739
ullong n, p;
740740

741-
if (get_tainted() & TAINTED_PANIC) {
741+
if (mark_tainted(TAINTED_WARN_BLOCKED_TRAFFIC) & TAINTED_PANIC) {
742742
/* a panic dump is already in progress, let's not disturb it,
743743
* we'll be called via signal DEBUGSIG. By returning we may be
744744
* able to leave a current signal handler (e.g. WDT) so that

src/wdt.c

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include <signal.h>
1313
#include <time.h>
1414

15+
#include <haproxy/activity.h>
1516
#include <haproxy/api.h>
1617
#include <haproxy/clock.h>
1718
#include <haproxy/debug.h>
@@ -38,6 +39,7 @@
3839
*/
3940
static struct {
4041
timer_t timer; /* presumably the watchdog timer handle for this thread slot -- TODO confirm against the setup code */
42+
uint prev_ctxsw; /* read by wdt_handler() and compared with activity[thr].ctxsw to decide whether to call ha_stuck_warning() */
4143
} per_thread_wd_ctx[MAX_THREADS];
4244

4345
/* Setup (or ping) the watchdog timer for thread <thr>. Returns non-zero on
@@ -106,10 +108,18 @@ void wdt_handler(int sig, siginfo_t *si, void *arg)
106108
* scheduler is still alive by setting the TH_FL_STUCK flag
107109
* that the scheduler clears when switching to the next task.
108110
* If it's already set, then it's our second call with no
109-
* progress and the thread is dead.
111+
* progress and the thread is dead. However, if we figure
112+
* that the scheduler made no progress since last time, we'll
113+
* at least emit a warning.
110114
*/
111115
if (!(_HA_ATOMIC_LOAD(&ha_thread_ctx[thr].flags) & TH_FL_STUCK)) {
116+
uint prev_ctxsw;
117+
112118
_HA_ATOMIC_OR(&ha_thread_ctx[thr].flags, TH_FL_STUCK);
119+
prev_ctxsw = HA_ATOMIC_LOAD(&per_thread_wd_ctx[tid].prev_ctxsw);
120+
if (HA_ATOMIC_LOAD(&activity[thr].ctxsw) == prev_ctxsw)
121+
ha_stuck_warning(thr);
122+
HA_ATOMIC_STORE(&activity[thr].ctxsw, prev_ctxsw);
113123
goto update_and_leave;
114124
}
115125

0 commit comments

Comments
 (0)