Skip to content

Commit 06364a3

Browse files
committed
MB-51665 Don't raise false "diskspace worker stuck" alert
We periodically run a diskspace check. That check can hang, so we have logic to detect when it has hung. However, we might also be running on a laptop that has been closed and then reopened some time later. Currently we cannot tell whether a missing periodic diskspace check is caused by the check being wedged or by the time jumping forward. We now do not declare the diskspace check to be wedged until it has failed to update the diskspace info for three times the periodic interval. If we wake up after a laptop has been reopened, enough time may have elapsed that we mark the diskspace check as pending stale; when the second interval elapses and the check still has not occurred, we assume it is wedged. We allow three misses to account for possible races between the process that updates the last-checked time and the process doing the check. Change-Id: I3e227f8588000eee5a3fc5d0afcf62fb18c21f3a Reviewed-on: https://review.couchbase.org/c/ns_server/+/174401 Well-Formed: Restriction Checker Well-Formed: Build Bot <[email protected]> Tested-by: Build Bot <[email protected]> Reviewed-by: Timofey Barmin <[email protected]> Reviewed-by: Bryan McCoid <[email protected]>
1 parent 00d00b0 commit 06364a3

File tree

1 file changed

+22
-9
lines changed

1 file changed

+22
-9
lines changed

src/menelaus_web_alerts_srv.erl

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
%% needed to mock ns_config in tests
1616
-include("ns_config.hrl").
1717

18+
-define(MAX_DISK_USAGE_MISSED_CHECKS, 2).
19+
1820
-ifdef(TEST).
1921
-include_lib("eunit/include/eunit.hrl").
2022
-endif.
@@ -304,15 +306,26 @@ check(ip, Opaque, _History, _Stats) ->
304306
Opaque;
305307

306308
check(disk_usage_analyzer_stuck, Opaque, _History, _Stats) ->
307-
case ns_disksup:is_stale() of
308-
true ->
309-
global_alert(disk_usage_analyzer_stuck,
310-
fmt_to_bin(
311-
errors(disk_usage_analyzer_stuck), [node()]));
312-
false -> ok
313-
end,
314-
315-
Opaque;
309+
IsStale = ns_disksup:is_stale(),
310+
Missed = case dict:find(disk_usage_missed_checks, Opaque) of
311+
{ok, M} -> M;
312+
error -> 0
313+
end,
314+
315+
NewMissed = case {IsStale, Missed} of
316+
{false, _} ->
317+
0;
318+
{true, Missed}
319+
when Missed < ?MAX_DISK_USAGE_MISSED_CHECKS ->
320+
Missed + 1;
321+
{true, Missed} ->
322+
global_alert(disk_usage_analyzer_stuck,
323+
fmt_to_bin(
324+
errors(disk_usage_analyzer_stuck),
325+
[node()])),
326+
Missed + 1
327+
end,
328+
dict:store(disk_usage_missed_checks, NewMissed, Opaque);
316329

317330
%% @doc check the capacity of the drives used for db and log files
318331
check(disk, Opaque, _History, _Stats) ->

0 commit comments

Comments
 (0)