Skip to content

Commit 6127e5a

Browse files
committed
DEBUG: wdt: make the blocked traffic warning delay configurable
The new global "warn-blocked-traffic-after" allows one to configure after how much time a warning should be emitted when traffic is blocked.
1 parent 7337c42 commit 6127e5a

File tree

3 files changed

+89
-9
lines changed

3 files changed

+89
-9
lines changed

doc/configuration.txt

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1562,6 +1562,7 @@ The following keywords are supported in the "global" section :
15621562
* Debugging
15631563
- anonkey
15641564
- quiet
1565+
- warn-blocked-traffic-after
15651566
- zero-warning
15661567

15671568
* HTTPClient
@@ -4394,6 +4395,24 @@ quiet
43944395
Do not display any message during startup. It is equivalent to the command-
43954396
line argument "-q".
43964397

4398+
warn-blocked-traffic-after <time>
4399+
This allows to adjust the delay after which a stuck task blocking the traffic
4400+
will trigger the emission of a warning on the standard error output. The
4401+
delay is expressed in milliseconds and defaults to 1000 ms. Permitted values
4402+
must be comprised between 1 ms and 1000 ms included. Lower values will
4403+
trigger warnings frequently and higher ones will rarely. The watchdog will
4404+
kill a runaway task that fails to respond twice for one second anyway, so a
4405+
1000 ms warning delay will normally not trigger any warning. It is
4406+
recommended to stay with values between 10 and 100ms to detect configuration
4407+
anomalies that may degrade the user's experience, causing long response times
4408+
or jerkiness on interactive sessions. For example, a poorly designed Lua
4409+
sample-fetch function doing heavy computations, or a very large map_reg or
4410+
map_regm map file with a very high evaluation cost may cause such trouble.
4411+
For comparison a TLS handshake can eat between one and two milliseconds, and
4412+
compressing a 16kB HTTP response buffer is around one millisecond. The output
4413+
contains a thread dump of the offending task with a backtrace and some
4414+
context that helps figure where the time is being spent.
4415+
43974416
zero-warning
43984417
When this option is set, HAProxy will refuse to start if any warning was
43994418
emitted while processing the configuration and applying it. It means that

src/debug.c

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -760,9 +760,10 @@ void ha_stuck_warning(int thr)
760760
" this must remain exceptional because the system's stability is now at risk.\n"
761761
" Timers in logs may be reported incorrectly, spurious timeouts may happen,\n"
762762
" some incoming connections may silently be dropped, health checks may\n"
763-
" randomly fail, and accesses to the CLI may block the whole process. Please\n"
764-
" check the trace below for any clues about configuration elements that need\n"
765-
" to be corrected:\n\n",
763+
" randomly fail, and accesses to the CLI may block the whole process. The\n"
764+
" blocking delay before emitting this warning may be adjusted via the global\n"
765+
" 'warn-blocked-traffic-after' directive. Please check the trace below for\n"
766+
" any clues about configuration elements that need to be corrected:\n\n",
766767
thr + 1, (n - p) / 1000000ULL,
767768
HA_ATOMIC_LOAD(&ha_thread_ctx[thr].stream_cnt));
768769

src/wdt.c

Lines changed: 66 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
#include <haproxy/activity.h>
1616
#include <haproxy/api.h>
17+
#include <haproxy/cfgparse.h>
1718
#include <haproxy/clock.h>
1819
#include <haproxy/debug.h>
1920
#include <haproxy/errors.h>
@@ -42,6 +43,9 @@ static struct {
4243
uint prev_ctxsw;
4344
} per_thread_wd_ctx[MAX_THREADS];
4445

46+
/* warn about stuck tasks after this delay (ns) */
47+
static unsigned int wdt_warn_blocked_traffic_ns = 1000000000U;
48+
4549
/* Setup (or ping) the watchdog timer for thread <thr>. Returns non-zero on
4650
* success, zero on failure. It interrupts once per second of CPU time. It
4751
* happens that timers based on the CPU time are not automatically re-armed
@@ -51,7 +55,8 @@ int wdt_ping(int thr)
5155
{
5256
struct itimerspec its;
5357

54-
its.it_value.tv_sec = 1; its.it_value.tv_nsec = 0;
58+
its.it_value.tv_sec = wdt_warn_blocked_traffic_ns / 1000000000U;
59+
its.it_value.tv_nsec = wdt_warn_blocked_traffic_ns % 1000000000U;
5560
its.it_interval.tv_sec = 0; its.it_interval.tv_nsec = 0;
5661
return timer_settime(per_thread_wd_ctx[thr].timer, 0, &its, NULL) == 0;
5762
}
@@ -87,7 +92,7 @@ void wdt_handler(int sig, siginfo_t *si, void *arg)
8792
/* not yet reached the deadline of 1 sec,
8893
* or p wasn't initialized yet
8994
*/
90-
if (!p || n - p < 1000000000UL)
95+
if (!p)
9196
goto update_and_leave;
9297

9398
if ((_HA_ATOMIC_LOAD(&ha_thread_ctx[thr].flags) & TH_FL_SLEEPING) ||
@@ -115,11 +120,22 @@ void wdt_handler(int sig, siginfo_t *si, void *arg)
115120
if (!(_HA_ATOMIC_LOAD(&ha_thread_ctx[thr].flags) & TH_FL_STUCK)) {
116121
uint prev_ctxsw;
117122

118-
_HA_ATOMIC_OR(&ha_thread_ctx[thr].flags, TH_FL_STUCK);
119123
prev_ctxsw = HA_ATOMIC_LOAD(&per_thread_wd_ctx[tid].prev_ctxsw);
120-
if (HA_ATOMIC_LOAD(&activity[thr].ctxsw) == prev_ctxsw)
121-
ha_stuck_warning(thr);
122-
HA_ATOMIC_STORE(&activity[thr].ctxsw, prev_ctxsw);
124+
125+
/* only after one second it's clear we're stuck */
126+
if (n - p >= 1000000000ULL)
127+
_HA_ATOMIC_OR(&ha_thread_ctx[thr].flags, TH_FL_STUCK);
128+
129+
/* have we crossed the warning boundary ? If so we note were we
130+
* where, and second time called from the same place will trigger
131+
* a warning (unless already stuck).
132+
*/
133+
if (n - p >= (ullong)wdt_warn_blocked_traffic_ns) {
134+
if (HA_ATOMIC_LOAD(&activity[thr].ctxsw) == prev_ctxsw)
135+
ha_stuck_warning(thr);
136+
HA_ATOMIC_STORE(&activity[thr].ctxsw, prev_ctxsw);
137+
}
138+
123139
goto update_and_leave;
124140
}
125141

@@ -163,6 +179,44 @@ void wdt_handler(int sig, siginfo_t *si, void *arg)
163179
wdt_ping(thr);
164180
}
165181

182+
/* parse the "warn-blocked-traffic-after" parameter */
183+
static int wdt_parse_warn_blocked(char **args, int section_type, struct proxy *curpx,
184+
const struct proxy *defpx, const char *file, int line,
185+
char **err)
186+
{
187+
const char *res;
188+
uint value;
189+
190+
if (!*args[1]) {
191+
memprintf(err, "'%s' expects <time> as argument between 1 and 1000 ms.\n", args[0]);
192+
return -1;
193+
}
194+
195+
res = parse_time_err(args[1], &value, TIME_UNIT_MS);
196+
if (res == PARSE_TIME_OVER) {
197+
memprintf(err, "timer overflow in argument '%s' to '%s' (maximum value is 1000 ms)",
198+
args[1], args[0]);
199+
return -1;
200+
}
201+
else if (res == PARSE_TIME_UNDER) {
202+
memprintf(err, "timer underflow in argument '%s' to '%s' (minimum value is 1 ms)",
203+
args[1], args[0]);
204+
return -1;
205+
}
206+
else if (res) {
207+
memprintf(err, "unexpected character '%c' in argument to <%s>.\n", *res, args[0]);
208+
return -1;
209+
}
210+
else if (value > 1000 || value < 1) {
211+
memprintf(err, "timer out of range in argument '%s' to '%s' (value must be between 1 and 1000 ms)",
212+
args[1], args[0]);
213+
return -1;
214+
}
215+
216+
wdt_warn_blocked_traffic_ns = value * 1000000U;
217+
return 0;
218+
}
219+
166220
int init_wdt_per_thread()
167221
{
168222
if (!clock_setup_signal_timer(&per_thread_wd_ctx[tid].timer, WDTSIG, tid))
@@ -202,6 +256,12 @@ int init_wdt()
202256
return ERR_NONE;
203257
}
204258

259+
static struct cfg_kw_list cfg_kws = {ILH, {
260+
{ CFG_GLOBAL, "warn-blocked-traffic-after", wdt_parse_warn_blocked },
261+
{ 0, NULL, NULL },
262+
}};
263+
264+
INITCALL1(STG_REGISTER, cfg_register_keywords, &cfg_kws);
205265
REGISTER_POST_CHECK(init_wdt);
206266
REGISTER_PER_THREAD_INIT(init_wdt_per_thread);
207267
REGISTER_PER_THREAD_DEINIT(deinit_wdt_per_thread);

0 commit comments

Comments
 (0)