Skip to content

Commit 261ac6a

Browse files
anuthanAliaksey Artamonau
authored andcommitted
Rebalance Stage information is now available through tasks API.
Helps with, MB-27463: Per-node details for certain stages MB-12000: Delta recovery information Part of EPIC, MB-30894: Rebalance visibility and reporting Change-Id: I42007b66a3664000be1aeff0bd18bc2d2ab4eba9 Reviewed-on: http://review.couchbase.org/101818 Tested-by: Abhijeeth Nuthan <[email protected]> Well-Formed: Build Bot <[email protected]> Reviewed-by: Aliaksey Artamonau <[email protected]>
1 parent 6c60d50 commit 261ac6a

File tree

3 files changed

+262
-40
lines changed

3 files changed

+262
-40
lines changed

src/ns_doctor.erl

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -794,6 +794,7 @@ do_build_rebalance_task(Timeout) ->
794794
case (catch ns_orchestrator:rebalance_progress_full(Timeout)) of
795795
{running, PerNode} ->
796796
DetailedProgress = get_detailed_progress(),
797+
StageInfo = ns_rebalance_observer:get_stage_info(),
797798

798799
Subtype = case ns_config:search(rebalancer_pid) =:= ns_config:search(graceful_failover_pid) of
799800
true ->
@@ -816,7 +817,8 @@ do_build_rebalance_task(Timeout) ->
816817
{perNode,
817818
{struct, [{Node, {struct, [{progress, Progress * 100}]}}
818819
|| {Node, Progress} <- PerNode]}},
819-
{detailedProgress, DetailedProgress}];
820+
{detailedProgress, DetailedProgress},
821+
{stageInfo, StageInfo}];
820822
FullProgress ->
821823
[{type, rebalance},
822824
{status, notRunning},

src/ns_rebalance_observer.erl

Lines changed: 63 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@
2323
-export([start_link/3,
2424
get_detailed_progress/0,
2525
get_aggregated_progress/1,
26+
get_stage_info/0,
27+
update_stage_info/2,
2628
update_progress/2]).
2729

2830
%% gen_server callbacks
@@ -71,9 +73,15 @@ get_detailed_progress() ->
7173
get_aggregated_progress(Timeout) ->
7274
generic_get_call(get_aggregated_progress, Timeout).
7375

76+
get_stage_info() ->
77+
generic_get_call(get_stage_info).
78+
7479
update_progress(Stage, StageProgress) ->
7580
gen_server:cast(?SERVER, {update_progress, Stage, StageProgress}).
7681

82+
update_stage_info(Stage, StageInfo) ->
83+
gen_server:cast(?SERVER, {update_stage_info, Stage, StageInfo}).
84+
7785
is_interesting_master_event({_, bucket_rebalance_started, _Bucket, _Pid}) ->
7886
fun handle_bucket_rebalance_started/2;
7987
is_interesting_master_event({_, set_ff_map, _BucketName, _Diff}) ->
@@ -82,10 +90,39 @@ is_interesting_master_event({_, vbucket_move_start, _Pid, _BucketName, _Node, _V
8290
fun handle_vbucket_move_start/2;
8391
is_interesting_master_event({_, vbucket_move_done, _BucketName, _VBucketId}) ->
8492
fun handle_vbucket_move_done/2;
93+
is_interesting_master_event({_, rebalance_stage_started, _Stage}) ->
94+
fun handle_rebalance_stage_started/2;
95+
is_interesting_master_event({_, rebalance_stage_completed, _Stage}) ->
96+
fun handle_rebalance_stage_completed/2;
97+
is_interesting_master_event({_, rebalance_stage_event, _Stage, _Event}) ->
98+
fun handle_rebalance_stage_event/2;
8599
is_interesting_master_event(_) ->
86100
undefined.
87101

88-
init({Stages, NodesInfo, Type}) ->
102+
possible_substages(kv, NodesInfo) ->
103+
case proplists:get_value(delta_nodes, NodesInfo, []) of
104+
[] ->
105+
[];
106+
DeltaNodes ->
107+
[{kv_delta_recovery, DeltaNodes, []}]
108+
end;
109+
possible_substages(_,_) ->
110+
[].
111+
112+
get_stage_nodes(Services, NodesInfo) ->
113+
ActiveNodes = proplists:get_value(active_nodes, NodesInfo, []),
114+
lists:filtermap(
115+
fun (Service) ->
116+
case ns_cluster_membership:service_nodes(ActiveNodes, Service) of
117+
[] ->
118+
false;
119+
Nodes ->
120+
SubStages = possible_substages(Service, NodesInfo),
121+
{true, {Service, Nodes, SubStages}}
122+
end
123+
end, lists:usort(Services)).
124+
125+
init({Services, NodesInfo, Type}) ->
89126
Self = self(),
90127
ns_pubsub:subscribe_link(master_activity_events,
91128
fun (Event, _Ignored) ->
@@ -97,8 +134,7 @@ init({Stages, NodesInfo, Type}) ->
97134
end
98135
end, []),
99136

100-
{active_nodes, ActiveNodes} = lists:keyfind(active_nodes, 1, NodesInfo),
101-
StageInfo = rebalance_stage_info:init(ActiveNodes, Stages),
137+
StageInfo = rebalance_stage_info:init(get_stage_nodes(Services, NodesInfo)),
102138
BucketsCount = length(ns_bucket:get_buckets()),
103139
proc_lib:spawn_link(erlang, apply, [fun docs_left_updater_init/1, [Self]]),
104140

@@ -119,6 +155,9 @@ handle_call(get_detailed_progress, _From, State) ->
119155
handle_call(get_aggregated_progress, _From,
120156
#state{stage_info = StageInfo} = State) ->
121157
{reply, dict:to_list(rebalance_stage_info:get_progress(StageInfo)), State};
158+
handle_call(get_stage_info, _From,
159+
#state{stage_info = StageInfo} = State) ->
160+
{reply, rebalance_stage_info:get_stage_info(StageInfo), State};
122161
handle_call(Req, From, State) ->
123162
?log_error("Got unknown request: ~p from ~p", [Req, From]),
124163
{reply, unknown_request, State}.
@@ -177,6 +216,10 @@ handle_cast({update_progress, Stage, StageProgress},
177216
NewStageInfo = rebalance_stage_info:update_progress(
178217
Stage, StageProgress, Old),
179218
{noreply, State#state{stage_info = NewStageInfo}};
219+
handle_cast({update_stage_info, Stage, StageInfo},
220+
#state{stage_info = Old} = State) ->
221+
New = rebalance_stage_info:update_stage_info(Stage, StageInfo, Old),
222+
{noreply, State#state{stage_info = New}};
180223

181224
handle_cast(Req, _State) ->
182225
?log_error("Got unknown cast: ~p", [Req]),
@@ -266,6 +309,23 @@ initiate_bucket_rebalance(BucketName, OldState) ->
266309
current_moves = [],
267310
pending_moves = Moves}.
268311

312+
handle_rebalance_stage_started({TS, rebalance_stage_started, Stage},
313+
#state{stage_info = Old} = State) ->
314+
New = rebalance_stage_info:update_stage_info(Stage, {started, TS}, Old),
315+
{noreply, State#state{stage_info = New}}.
316+
317+
handle_rebalance_stage_completed({TS, rebalance_stage_completed, Stage},
318+
#state{stage_info = Old} = State) ->
319+
New = rebalance_stage_info:update_stage_info(Stage, {completed, TS}, Old),
320+
{noreply, State#state{stage_info = New}}.
321+
322+
handle_rebalance_stage_event({TS, rebalance_stage_event, Stage, Text},
323+
#state{stage_info = Old} = State) ->
324+
New = rebalance_stage_info:update_stage_info(Stage,
325+
{notable_event, TS, Text},
326+
Old),
327+
{noreply, State#state{stage_info = New}}.
328+
269329
handle_bucket_rebalance_started({_, bucket_rebalance_started, _BucketName, _Pid},
270330
#state{bucket_number = Number} = State) ->
271331
NewState = State#state{bucket_number=Number + 1},

src/rebalance_stage_info.erl

Lines changed: 196 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -15,36 +15,69 @@
1515
%%
1616
-module(rebalance_stage_info).
1717

18-
-export([init/2, get_progress/1, update_progress/3]).
18+
-export([init/1,
19+
get_stage_info/1,
20+
get_progress/1,
21+
update_progress/3,
22+
update_stage_info/3,
23+
diff_timestamp/2,
24+
binarify_timestamp/1]).
1925
-export_type([stage_info/0]).
2026

27+
-record(stage_details, {
28+
start_time = false,
29+
complete_time = false,
30+
sub_stages = [] :: [{atom(), #stage_details{}}],
31+
notable_events = []
32+
}).
33+
2134
-record(stage_info, {
2235
per_stage_progress :: dict:dict(),
23-
aggregated :: dict:dict()
36+
%% aggregated needed for backward compatibility.
37+
aggregated :: dict:dict(),
38+
per_stage_info :: [{atom(), #stage_details{}}]
2439
}).
2540

2641
-type stage_info() :: #stage_info{}.
2742

28-
init(LiveNodes, Stages) ->
29-
do_init([{S, ns_cluster_membership:service_nodes(LiveNodes, S)} ||
30-
S <- Stages]).
31-
32-
do_init(Stages) ->
33-
aggregate(init_per_stage(Stages)).
43+
init(Stages) ->
44+
PerStageProgress = dict:from_list(init_per_stage_progress(Stages)),
45+
Aggregated = aggregate(PerStageProgress),
46+
StageInfo = init_per_stage_info(Stages),
47+
#stage_info{per_stage_progress = PerStageProgress,
48+
aggregated = Aggregated,
49+
per_stage_info = StageInfo}.
3450

35-
init_per_stage(Stages) ->
36-
dict:from_list([{Stage, init_stage(Nodes)} ||
37-
{Stage, Nodes} <- Stages]).
51+
init_per_stage_progress(Stages) ->
52+
lists:flatten([init_stage_progress(S, N, SS) || {S, N, SS} <- Stages]).
3853

39-
init_stage(Nodes) ->
40-
dict:from_list([{N, 0} || N <- Nodes]).
54+
init_stage_progress(_Stage, [], _SubStage) ->
55+
[];
56+
init_stage_progress(Stage, Nodes, SubStages) ->
57+
SubStageNodes = init_per_stage_progress(SubStages),
58+
[{Stage, dict:from_list([{N, 0} || N <- Nodes])} | SubStageNodes].
4159

60+
%% For backward compatibility.
4261
get_progress(#stage_info{aggregated = Aggregated}) ->
4362
Aggregated.
4463

45-
update_progress(Stage, StageProgress,
46-
#stage_info{per_stage_progress = PerStage}) ->
47-
aggregate(do_update_progress(Stage, StageProgress, PerStage)).
64+
init_per_stage_info(Stages) ->
65+
[{Stage, #stage_details{
66+
start_time = false,
67+
complete_time = false,
68+
sub_stages = init_per_stage_info(SubStages),
69+
notable_events = []
70+
}} || {Stage, Nodes, SubStages} <- Stages, Nodes =/= []].
71+
72+
update_progress(
73+
Stage, StageProgress,
74+
#stage_info{per_stage_progress = OldPerStageProgress} = StageInfo) ->
75+
NewPerStageProgress = do_update_progress(Stage, StageProgress,
76+
OldPerStageProgress),
77+
Aggregated = aggregate(NewPerStageProgress),
78+
StageInfo#stage_info{
79+
per_stage_progress = NewPerStageProgress,
80+
aggregated = Aggregated}.
4881

4982
do_update_progress(Stage, StageProgress, PerStage) ->
5083
dict:update(Stage,
@@ -55,23 +88,150 @@ do_update_progress(Stage, StageProgress, PerStage) ->
5588
end, PerStage).
5689

5790
aggregate(PerStage) ->
58-
Aggregated0 =
59-
dict:fold(
60-
fun (_, StageProgress, AggAcc) ->
61-
dict:fold(
62-
fun (Node, NodeProgress, Acc) ->
63-
misc:dict_update(
64-
Node,
65-
fun ({Count, Sum}) ->
66-
{Count + 1, Sum + NodeProgress}
67-
end, {0, 0}, Acc)
68-
end, AggAcc, StageProgress)
69-
end, dict:new(), PerStage),
70-
71-
Aggregated =
72-
dict:map(fun (_, {Count, Sum}) ->
73-
Sum / Count
74-
end, Aggregated0),
75-
76-
#stage_info{per_stage_progress = PerStage,
77-
aggregated = Aggregated}.
91+
TmpAggr = dict:fold(
92+
fun (_, StageProgress, AggAcc) ->
93+
dict:fold(
94+
fun (Node, NodeProgress, Acc) ->
95+
misc:dict_update(
96+
Node,
97+
fun ({Count, Sum}) ->
98+
{Count + 1, Sum + NodeProgress}
99+
end, {0, 0}, Acc)
100+
end, AggAcc, StageProgress)
101+
end, dict:new(), PerStage),
102+
103+
dict:map(fun (_, {Count, Sum}) ->
104+
Sum / Count
105+
end, TmpAggr).
106+
107+
get_stage_info(StageInfo) ->
108+
{get_per_stage_info(StageInfo)}.
109+
110+
diff_timestamp(false, false) ->
111+
false;
112+
diff_timestamp(false, StartTS) ->
113+
diff_timestamp(os:timestamp(), StartTS);
114+
diff_timestamp(EndTS, StartTS) ->
115+
round(timer:now_diff(EndTS, StartTS) / 1000).
116+
117+
binarify_timestamp(false) ->
118+
false;
119+
binarify_timestamp(Time) ->
120+
erlang:list_to_binary(misc:timestamp_local_iso8601(Time)).
121+
122+
get_per_stage_info(#stage_info{
123+
per_stage_progress = PerStageProgress,
124+
per_stage_info = PerStageInfo}) ->
125+
AllStageProgress = get_per_stage_progress(PerStageProgress),
126+
lists:map(
127+
fun ({Stage, StageInfo}) ->
128+
construct_per_stage_json(AllStageProgress,
129+
Stage,
130+
StageInfo)
131+
end, PerStageInfo).
132+
133+
construct_per_stage_json(AllStageProgress, Stage, StageInfo) ->
134+
{ok, PerNodeProgress} = dict:find(Stage, AllStageProgress),
135+
TotalStageProgress = case lists:foldl(fun ({_, P}, {Total, Count}) ->
136+
{Total + P, Count + 1}
137+
end, {0, 0}, PerNodeProgress) of
138+
{_, 0} ->
139+
0;
140+
{TotalProgress, NodesCount} ->
141+
TotalProgress * 100.0 / NodesCount
142+
end,
143+
ProgressInfoJson = construct_per_stage_progress_json(TotalStageProgress,
144+
PerNodeProgress,
145+
StageInfo),
146+
StageInfoJson = construct_per_stage_info_json(StageInfo, AllStageProgress),
147+
{get_stage_name(Stage), {ProgressInfoJson ++ StageInfoJson}}.
148+
149+
get_stage_name(Stage) ->
150+
Stages = [{kv, data},
151+
{n1ql, query},
152+
{index, index},
153+
{fts, search},
154+
{cbas, analytics},
155+
{eventing, eventing},
156+
{kv_delta_recovery, deltaRecovery}],
157+
case lists:keyfind(Stage, 1, Stages) of
158+
{Stage, Val} -> Val;
159+
false -> Stage
160+
end.
161+
162+
construct_per_stage_progress_json(
163+
TSP, _, #stage_details{complete_time = false}) when TSP == 0 ->
164+
[];
165+
construct_per_stage_progress_json(TotalStageProgress, PerNodeProgress,
166+
#stage_details{complete_time = false}) ->
167+
[{totalProgress, TotalStageProgress},
168+
{perNodeProgress, {PerNodeProgress}}];
169+
construct_per_stage_progress_json(_, PerNodeProgress, _StageInfo) ->
170+
Completed = [{N, 1.0} || {N, _} <- PerNodeProgress],
171+
[{totalProgress, 100.0},
172+
{perNodeProgress, {Completed}}].
173+
174+
construct_per_stage_info_json(#stage_details{
175+
start_time = StartTime,
176+
complete_time = EndTime,
177+
sub_stages = SubStages,
178+
notable_events = NotableEvents},
179+
AllStageProgress) ->
180+
SubStagesInfo = case SubStages of
181+
[] -> [];
182+
_ -> [{subStages,
183+
{[construct_per_stage_json(AllStageProgress,
184+
SubStage,
185+
SubStageInfo) ||
186+
{SubStage, SubStageInfo} <- SubStages]}}]
187+
end,
188+
Events = case NotableEvents of
189+
[] -> [];
190+
_ -> [{events, {NotableEvents}}]
191+
end,
192+
193+
[{startTime, binarify_timestamp(StartTime)},
194+
{completedTime, binarify_timestamp(EndTime)},
195+
{timeTaken, diff_timestamp(EndTime, StartTime)}] ++ SubStagesInfo ++ Events.
196+
197+
get_per_stage_progress(PerStageProgress) ->
198+
dict:map(fun (_, StageProgress) ->
199+
dict:to_list(StageProgress)
200+
end, PerStageProgress).
201+
202+
update_stage_info({started, Time}, StageInfo) ->
203+
StageInfo#stage_details{start_time = Time,
204+
complete_time = false};
205+
update_stage_info({completed, Time}, StageInfo) ->
206+
StageInfo#stage_details{complete_time = Time};
207+
update_stage_info({notable_event, TS, Text},
208+
#stage_details{notable_events = NotableEvents} = StageInfo) ->
209+
Time = binarify_timestamp(TS),
210+
Msg = list_to_binary(Text),
211+
StageInfo#stage_details{notable_events = [{Time, Msg} | NotableEvents]}.
212+
213+
update_stage_info(Stage, StageInfoUpdate,
214+
#stage_info{per_stage_info = PerStageInfo} = StageInfo) ->
215+
NewPerStageInfo = update_stage_info_rec(Stage, StageInfoUpdate,
216+
PerStageInfo),
217+
StageInfo#stage_info{per_stage_info = NewPerStageInfo}.
218+
219+
update_stage_info_rec([Stage | SubStages], StageInfoUpdate, AllStageInfo) ->
220+
case lists:keysearch(Stage, 1, AllStageInfo) of
221+
false ->
222+
AllStageInfo;
223+
{value, {Stage, OldStageInfo}} ->
224+
NewStageInfo =
225+
case SubStages of
226+
[] ->
227+
update_stage_info(StageInfoUpdate, OldStageInfo);
228+
_ ->
229+
NewSubStages = update_stage_info_rec(
230+
SubStages,
231+
StageInfoUpdate,
232+
OldStageInfo#stage_details.sub_stages),
233+
234+
OldStageInfo#stage_details{sub_stages = NewSubStages}
235+
end,
236+
lists:keyreplace(Stage, 1, AllStageInfo, {Stage, NewStageInfo})
237+
end.

0 commit comments

Comments
 (0)