Skip to content

Commit c845c11

Browse files
committed
Introduce new master activity events for ...
1. Signifying start of a stage of rebalance. 2. Signifying completion of a stage of rebalance. 3. Signifying a notable event that does not cause rebalance to fail, during a particular stage. A stage of rebalance can be a nested stage, for example, delta recovery is a sub stage of kv rebalance stage. Helps with, MB-27463: Per-node details for certain stages MB-12000: Delta recovery information Part of EPIC, MB-30894: Rebalance visibility and reporting Change-Id: I28966768ecf3a3f6d832f131a414629a5b425005 Reviewed-on: http://review.couchbase.org/101816 Reviewed-by: Aliaksey Artamonau <[email protected]> Tested-by: Abhijeeth Nuthan <[email protected]>
1 parent 6a85a17 commit c845c11

File tree

3 files changed

+60
-4
lines changed

3 files changed

+60
-4
lines changed

src/master_activity_events.erl

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,14 +59,31 @@
5959
note_set_service_map/2,
6060
note_autofailover_node_state_change/4,
6161
note_autofailover_server_group_state_change/4,
62-
note_autofailover_done/2
62+
note_autofailover_done/2,
63+
note_rebalance_stage_started/1,
64+
note_rebalance_stage_completed/1,
65+
note_rebalance_stage_event/2
6366
]).
6467

6568
-export([stream_events/2]).
6669

6770
submit_cast(Arg) ->
6871
(catch gen_event:notify(master_activity_events_ingress, {submit_master_event, Arg})).
6972

73+
get_stage_list(Stage) when is_atom(Stage) ->
74+
[Stage];
75+
get_stage_list(Stage) when is_list(Stage) ->
76+
Stage.
77+
78+
note_rebalance_stage_started(Stage) ->
79+
submit_cast({rebalance_stage_started, get_stage_list(Stage)}).
80+
81+
note_rebalance_stage_completed(Stage) ->
82+
submit_cast({rebalance_stage_completed, get_stage_list(Stage)}).
83+
84+
note_rebalance_stage_event(Stage, Text) ->
85+
submit_cast({rebalance_stage_event, get_stage_list(Stage), Text}).
86+
7087
note_vbucket_state_change(Bucket, Node, VBucketId, NewState) ->
7188
submit_cast({vbucket_state_change, Bucket, Node, VBucketId, NewState}).
7289

@@ -375,6 +392,22 @@ maybe_get_pids_node(Pid) when is_pid(Pid) ->
375392
maybe_get_pids_node(_PerhapsBinary) ->
376393
skip_this_pair_please.
377394

395+
event_to_jsons({TS, rebalance_stage_started, Stage}) ->
396+
[format_simple_plist_as_json([{type, rebalanceStageStarted},
397+
{ts, misc:time_to_epoch_float(TS)},
398+
{stage, {list, Stage}}])];
399+
400+
event_to_jsons({TS, rebalance_stage_completed, Stage}) ->
401+
[format_simple_plist_as_json([{type, rebalanceStageCompleted},
402+
{ts, misc:time_to_epoch_float(TS)},
403+
{stage, {list, Stage}}])];
404+
405+
event_to_jsons({TS, rebalance_stage_event, Stage, Text}) ->
406+
[format_simple_plist_as_json([{type, rebalanceStageEvent},
407+
{ts, misc:time_to_epoch_float(TS)},
408+
{stage, {list, Stage}},
409+
{event, Text}])];
410+
378411
event_to_jsons({TS, vbucket_state_change, Bucket, Node, VBucketId, NewState}) ->
379412
[format_simple_plist_as_json([{type, vbucketStateChange},
380413
{ts, misc:time_to_epoch_float(TS)},

src/ns_rebalancer.erl

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -540,9 +540,13 @@ rebalance_simple_services(Config, Services, KeepNodes) ->
540540
true ->
541541
lists:filtermap(
542542
fun (Service) ->
543+
master_activity_events:note_rebalance_stage_started(
544+
Service),
543545
ServiceNodes = ns_cluster_membership:service_nodes(KeepNodes, Service),
544546
Updated = update_service_map_with_config(Config, Service, ServiceNodes),
545547

548+
master_activity_events:note_rebalance_stage_completed(
549+
Service),
546550
case Updated of
547551
false ->
548552
false;
@@ -596,10 +600,14 @@ rebalance_topology_aware_services(Config, Services, KeepNodesAll, EjectNodesAll)
596600
[] ->
597601
false;
598602
_ ->
603+
master_activity_events:note_rebalance_stage_started(
604+
Service),
599605
update_service_map_with_config(Config, Service, AllNodes),
600606
ok = rebalance_topology_aware_service(Service, KeepNodes,
601607
EjectNodes, DeltaNodes),
602608
update_service_map(Service, AllNodes, KeepNodes),
609+
master_activity_events:note_rebalance_stage_completed(
610+
Service),
603611
{true, {Service, os:timestamp()}}
604612
end
605613
end, Services).
@@ -708,6 +716,7 @@ rebalance_body(KeepNodes,
708716

709717
ok = drop_old_2i_indexes(KeepNodes),
710718

719+
master_activity_events:note_rebalance_stage_started(kv),
711720
%% wait till all bucket shutdowns are done on nodes we're
712721
%% adding (or maybe adding).
713722
do_wait_buckets_shutdown(KeepNodes),
@@ -721,9 +730,13 @@ rebalance_body(KeepNodes,
721730
run_janitor_pre_rebalance(Bucket)
722731
end, BucketConfigs),
723732

733+
master_activity_events:note_rebalance_stage_started(
734+
[kv, kv_delta_recovery]),
724735
ok = apply_delta_recovery_buckets(DeltaRecoveryBuckets,
725736
KVDeltaNodes, BucketConfigs),
726737
ok = maybe_clear_recovery_type(KeepNodes),
738+
master_activity_events:note_rebalance_stage_completed(
739+
[kv, kv_delta_recovery]),
727740
ok = service_janitor:cleanup(),
728741

729742
ok = leader_activities:activate_quorum_nodes(KeepNodes),
@@ -738,6 +751,7 @@ rebalance_body(KeepNodes,
738751
ok = check_test_condition(rebalance_cluster_nodes_active),
739752

740753
rebalance_kv(KeepNodes, EjectNodesAll, BucketConfigs, DeltaRecoveryBuckets),
754+
master_activity_events:note_rebalance_stage_completed(kv),
741755
rebalance_services(KeepNodes, EjectNodesAll),
742756

743757
ok = leader_activities:deactivate_quorum_nodes(EjectNodesAll),
@@ -1357,10 +1371,13 @@ do_run_graceful_failover_moves(Nodes, BucketName, BucketConfig, I, N) ->
13571371
Map = proplists:get_value(map, BucketConfig, []),
13581372
Map1 = mb_map:promote_replicas_for_graceful_failover(Map, Nodes),
13591373

1374+
master_activity_events:note_rebalance_stage_started(kv),
13601375
ProgressFun = make_progress_fun(I, N),
1361-
run_mover(BucketName, BucketConfig,
1362-
proplists:get_value(servers, BucketConfig),
1363-
ProgressFun, Map, Map1).
1376+
RV = run_mover(BucketName, BucketConfig,
1377+
proplists:get_value(servers, BucketConfig),
1378+
ProgressFun, Map, Map1),
1379+
master_activity_events:note_rebalance_stage_completed(kv),
1380+
RV.
13641381

13651382
check_graceful_failover_possible(Nodes, BucketsAll) ->
13661383
%% No graceful failovers for non KV node

src/ns_vbucket_mover.erl

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,12 @@ spawn_compaction_uninhibitor(Bucket, Node, MRef) ->
294294
master_activity_events:note_compaction_uninhibit_done(Bucket, Node),
295295
ok;
296296
nack ->
297+
Msg = io_lib:format(
298+
"failed to initiate compaction for "
299+
"bucket ~p on node ~p",
300+
[Bucket, Node]),
301+
master_activity_events:note_rebalance_stage_event(
302+
kv, Msg),
297303
erlang:exit({failed_to_initiate_compaction, Bucket, Node, MRef})
298304
end;
299305
_ ->

0 commit comments

Comments
 (0)