Skip to content

Commit 32df67d

Browse files
committed
Merge remote-tracking branch 'couchbase/trinity' into phoenix
* MB-67082: Add online eviction policy test cases
* MB-67082: Add eviction_policy tests for storage migration
* MB-67758: Support UI pop ups for auto-failover alerts
* MB-67082: Add support for online eviction_policy changes
* MB-67082: Add --no-restart option for eviction policy changes

Change-Id: Ic4374cc178ee8f6afa0e52825c75ce82ff573b00
2 parents 9a3fd20 + 87a79eb commit 32df67d

File tree

9 files changed

+1260
-191
lines changed

9 files changed

+1260
-191
lines changed

apps/ns_server/src/auto_failover.erl

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -212,9 +212,10 @@ cast(Call) ->
212212
misc:wait_for_global_name(?MODULE),
213213
gen_server:cast(?SERVER, Call).
214214

215-
-define(log_info_and_email(Alert, Fmt, Args),
216-
ale:info(?USER_LOGGER, Fmt, Args),
217-
ns_email_alert:alert(Alert, Fmt, Args)).
215+
%% Formats Fmt/Args into a binary message and hands it to
%% menelaus_web_alerts_srv:global_alert/2 under the given Alert key.
%% The alert server sends emails, raises pop ups, etc as needed.
%%
%% The macro is deliberately a single expression that binds no variables:
%% binding a name such as `Message' here would leak it into the caller's
%% scope, so a second expansion in the same scope (or a caller that already
%% has a variable of that name) would turn the binding into a match and
%% fail with badmatch.
-define(log_and_alert(Alert, Fmt, Args),
        menelaus_web_alerts_srv:global_alert(
          Alert, iolist_to_binary(io_lib:format(Fmt, Args)))).
218219

219220
%% @doc Returns a list of all alerts that might send out an email notification.
220221
-spec alert_keys() -> [atom()].
@@ -550,7 +551,7 @@ trim_nodes_for_failover(Nodes, S) ->
550551

551552
process_action({mail_too_small, Service, SvcNodes, {Node, _UUID}},
552553
S, _, _, _) ->
553-
?log_info_and_email(
554+
?log_and_alert(
554555
auto_failover_cluster_too_small,
555556
"Could not auto-failover node (~p). "
556557
"Number of remaining nodes that are running ~s service is ~p. "
@@ -561,28 +562,28 @@ process_action({mail_too_small, Service, SvcNodes, {Node, _UUID}},
561562
auto_failover_logic:service_failover_min_node_count()]),
562563
S;
563564
process_action({mail_down_warning, {Node, _UUID}}, S, _, _, _) ->
564-
?log_info_and_email(
565+
?log_and_alert(
565566
auto_failover_other_nodes_down,
566567
"Could not auto-failover node (~p). "
567568
"There was at least another node down.",
568569
[Node]),
569570
S;
570571
process_action({mail_down_warning_multi_node, {Node, _UUID}}, S, _, _, _) ->
571-
?log_info_and_email(
572+
?log_and_alert(
572573
auto_failover_other_nodes_down,
573574
"Could not auto-failover node (~p). "
574575
"The list of nodes being down has changed.",
575576
[Node]),
576577
S;
577578
process_action({mail_auto_failover_disabled, Service, {Node, _}}, S, _, _, _) ->
578-
?log_info_and_email(
579+
?log_and_alert(
579580
auto_failover_disabled,
580581
"Could not auto-failover node (~p). "
581582
"Auto-failover for ~s service is disabled.",
582583
[Node, ns_cluster_membership:user_friendly_service_name(Service)]),
583584
S;
584585
process_action({mail_kv_not_fully_failed_over, {Node, _}}, S, _, _, _) ->
585-
?log_info_and_email(
586+
?log_and_alert(
586587
auto_failover_other_nodes_down,
587588
"Could not auto-failover service node (~p). "
588589
"One of the data service nodes cannot be automatically failed over.",
@@ -644,12 +645,12 @@ maybe_report_max_node_reached(AllNodes, NotFailedOver, ErrMsg, S) ->
644645
true ->
645646
case AllNodes -- NotFailedOver of
646647
[] ->
647-
?log_info_and_email(
648+
?log_and_alert(
648649
auto_failover_maximum_reached,
649650
"Could not auto-failover more nodes (~p). ~s",
650651
[NotFailedOver, ErrMsg]);
651652
RemainingNodes ->
652-
?log_info_and_email(
653+
?log_and_alert(
653654
auto_failover_maximum_reached,
654655
"Could not auto-failover nodes (~p). ~s Continuing to "
655656
"auto-failover nodes ~p",
@@ -685,12 +686,12 @@ log_failover_success(Node, DownNodes, NodeStatuses) ->
685686
case failover_reason(Node, DownNodes, NodeStatuses) of
686687
{Reason, MA} ->
687688
master_activity_events:note_autofailover_done(Node, MA),
688-
?log_info_and_email(
689+
?log_and_alert(
689690
auto_failover_node,
690691
"Node (~p) was automatically failed over. Reason: ~s",
691692
[Node, Reason]);
692693
Reason ->
693-
?log_info_and_email(
694+
?log_and_alert(
694695
auto_failover_node,
695696
"Node (~p) was automatically failed over.~n~p", [Node, Reason])
696697

@@ -717,7 +718,7 @@ log_unsafe_node({Node, {Service, Error}}, State) ->
717718
Flag = {Node, Service, Error},
718719
case should_report(Flag, State) of
719720
true ->
720-
?log_info_and_email(
721+
?log_and_alert(
721722
auto_failover_node,
722723
"Could not automatically fail over node (~p) due to operation "
723724
"being unsafe for service ~p. ~s",
@@ -777,7 +778,7 @@ process_failover_error(last_node, Nodes, S) ->
777778
report_failover_error(Flag, ErrMsg, Nodes, State) ->
778779
case should_report(Flag, State) of
779780
true ->
780-
?log_info_and_email(
781+
?log_and_alert(
781782
auto_failover_node,
782783
"Could not automatically fail over nodes (~p). ~s",
783784
[Nodes, ErrMsg]),

apps/ns_server/src/memcached_bucket_config.erl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@ get_continuous_backup_interval(BucketConfig) ->
136136
get_eviction_policy(Persistent, BucketConfig) ->
137137
case ns_bucket:is_persistent(BucketConfig) of
138138
Persistent ->
139-
case ns_bucket:eviction_policy(BucketConfig) of
139+
case ns_bucket:node_eviction_policy(BucketConfig) of
140140
nru_eviction ->
141141
auto_delete;
142142
no_eviction ->

apps/ns_server/src/menelaus_web_buckets.erl

Lines changed: 54 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -803,7 +803,13 @@ handle_bucket_update_inner(BucketId, Req, Params, Limit) ->
803803
{false, _, {ok, ParsedProps, _}} ->
804804
BucketType = proplists:get_value(bucketType, ParsedProps),
805805
UpdatedProps = ns_bucket:extract_bucket_props(ParsedProps),
806-
case update_bucket(Ctx, BucketId, BucketType, UpdatedProps, Req) of
806+
Options =
807+
case proplists:get_value(no_restart, ParsedProps, false) of
808+
true -> [no_restart];
809+
false -> []
810+
end,
811+
case update_bucket(Ctx, BucketId, BucketType, UpdatedProps,
812+
Options, Req) of
807813
retry ->
808814
handle_bucket_update_inner(BucketId, Req, Params,
809815
Limit - 1);
@@ -849,6 +855,10 @@ storage_mode_migration_error(history_retention_enabled_on_bucket) ->
849855
{"Cannot migrate storage mode. history_retention enabled on bucket.", 400};
850856
storage_mode_migration_error(history_retention_enabled_on_collections) ->
851857
{"Cannot migrate storage mode. history_retention enabled on collections.",
858+
400};
859+
storage_mode_migration_error(eviction_policy_no_restart_required) ->
860+
{"Eviction policy changes during storage mode migration require "
861+
"--no-restart option.",
852862
400}.
853863

854864
reply_storage_mode_migration_error(Req, Error) ->
@@ -875,14 +885,10 @@ maybe_update_cas_props(BucketId, BucketConfig, UpdatedProps, true = _CcvEn) ->
875885
maybe_update_cas_props(_, _, UpdatedProps, false = _CcEn) ->
876886
{ok, UpdatedProps}.
877887

878-
update_via_orchestrator(Req, BucketId, StorageMode, BucketType, UpdatedProps) ->
879-
update_via_orchestrator(Req, BucketId, StorageMode, BucketType,
880-
UpdatedProps, true).
881-
882888
update_via_orchestrator(Req, BucketId, StorageMode, BucketType, UpdatedProps,
883-
CanRetry) ->
889+
CanRetry, Options) ->
884890
case ns_orchestrator:update_bucket(BucketType, StorageMode,
885-
BucketId, UpdatedProps) of
891+
BucketId, UpdatedProps, Options) of
886892
ok ->
887893
ns_audit:modify_bucket(Req, BucketId, BucketType, UpdatedProps),
888894
DisplayBucketType = ns_bucket:display_type(BucketType,
@@ -911,7 +917,11 @@ update_via_orchestrator(Req, BucketId, StorageMode, BucketType, UpdatedProps,
911917
{error, {storage_mode_migration, janitor_not_run}} when CanRetry ->
912918
ns_orchestrator:ensure_janitor_run({bucket, BucketId}, 5000),
913919
update_via_orchestrator(Req, BucketId, StorageMode, BucketType,
914-
UpdatedProps, false);
920+
UpdatedProps, false, Options);
921+
{error, {eviction_policy_change, janitor_not_run}} when CanRetry ->
922+
ns_orchestrator:ensure_janitor_run({bucket, BucketId}, 5000),
923+
update_via_orchestrator(Req, BucketId, StorageMode, BucketType,
924+
UpdatedProps, false, Options);
915925
{error, {storage_mode_migration, Error}} ->
916926
reply_storage_mode_migration_error(Req, Error);
917927
{error, secret_not_found} ->
@@ -930,7 +940,7 @@ update_via_orchestrator(Req, BucketId, StorageMode, BucketType, UpdatedProps,
930940
end
931941
end.
932942

933-
update_bucket(Ctx, BucketId, BucketType, UpdatedProps, Req) ->
943+
update_bucket(Ctx, BucketId, BucketType, UpdatedProps, Options, Req) ->
934944
#bv_ctx{bucket_config = BucketConfig} = Ctx,
935945
StorageMode = ns_bucket:storage_mode(BucketConfig),
936946
CcvEn = proplists:get_value(cross_cluster_versioning_enabled,
@@ -939,7 +949,7 @@ update_bucket(Ctx, BucketId, BucketType, UpdatedProps, Req) ->
939949
case maybe_update_cas_props(BucketId, BucketConfig, UpdatedProps, CcvEn) of
940950
{ok, UpdateProps1} ->
941951
update_via_orchestrator(Req, BucketId, StorageMode, BucketType,
942-
UpdateProps1);
952+
UpdateProps1, true, Options);
943953
{error, max_cas_vbucket_retrieval_no_map} ->
944954
reply_text(Req, "Unable to retrieve max_cas due to no vBucket map",
945955
503);
@@ -1625,7 +1635,8 @@ basic_bucket_params_screening(Ctx, Params) ->
16251635
CommonParams = validate_common_params(Ctx, Params),
16261636
TypeSpecificParams =
16271637
validate_bucket_type_specific_params(CommonParams, Params, Ctx),
1628-
Candidates = CommonParams ++ TypeSpecificParams,
1638+
NoRestartParam = parse_validate_no_restart(Params),
1639+
Candidates = CommonParams ++ TypeSpecificParams ++ [NoRestartParam],
16291640
assert_candidates(Candidates),
16301641
%% Basic parameter checking has been done. Take the non-error key/values
16311642
%% and do additional checking (e.g. relationships between different
@@ -3730,6 +3741,38 @@ parse_validate_workload_pattern_default(Params) ->
37303741
"'mixed'">>}
37313742
end.
37323743

3744+
%% Validates the optional "noRestart" request parameter.
%%
%% Returns:
%%   ignore                     - "noRestart" was not supplied;
%%   {ok, no_restart, Bool}     - supplied, parsed as a boolean, and
%%                                "evictionPolicy" is also present in Params;
%%   {error, no_restart, Msg}   - the allow_online_eviction_policy_change
%%                                config key is not enabled, the value is not
%%                                a boolean, or "evictionPolicy" is absent.
%%
%% The config key is only consulted when "noRestart" is present, and the
%% value is only parsed when the feature is enabled.
parse_validate_no_restart(Params) ->
    case proplists:get_value("noRestart", Params) of
        undefined ->
            ignore;
        RawValue ->
            case ns_config:read_key_fast(allow_online_eviction_policy_change,
                                         false) of
                false ->
                    {error, no_restart,
                     <<"noRestart option is not supported. Enable "
                       "allow_online_eviction_policy_change to use this "
                       "feature">>};
                true ->
                    validate_no_restart_flag(RawValue, Params)
            end
    end.

%% Parses RawValue as a boolean and accepts it only when the request also
%% carries an "evictionPolicy" parameter.
validate_no_restart_flag(RawValue, Params) ->
    case menelaus_util:parse_validate_boolean(RawValue) of
        {ok, Flag} ->
            case proplists:get_value("evictionPolicy", Params) of
                undefined ->
                    {error, no_restart,
                     <<"noRestart option has no effect unless "
                       "evictionPolicy is being changed">>};
                _EvictionPolicy ->
                    {ok, no_restart, Flag}
            end;
        _NotBoolean ->
            {error, no_restart,
             <<"noRestart must be a boolean (true/false)">>}
    end.
3774+
3775+
37333776
handle_compact_bucket(_PoolId, Bucket, Req) ->
37343777
ok = compaction_api:force_compact_bucket(Bucket),
37353778
reply(Req, 200).

apps/ns_server/src/menelaus_web_node.erl

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -424,6 +424,20 @@ do_build_nodes_info_fun(#ctx{ns_config = Config,
424424
fun (_, _) -> [] end
425425
end,
426426

427+
PerNodeEvictionPolicyBuilder =
428+
case WithBucket of
429+
true ->
430+
fun (_Node, undefined) ->
431+
[];
432+
(Node, BucketName) ->
433+
{ok, BucketConfig} =
434+
ns_bucket:get_bucket(BucketName, Snapshot),
435+
build_eviction_policy(Node, BucketConfig)
436+
end;
437+
false ->
438+
fun (_, _) -> [] end
439+
end,
440+
427441
fun(WantENode, Bucket) ->
428442
InfoNode = ns_doctor:get_node(WantENode, NodeStatuses),
429443
StableInfo =
@@ -448,7 +462,8 @@ do_build_nodes_info_fun(#ctx{ns_config = Config,
448462
build_failover_status(Snapshot, WantENode),
449463
LimitsAndBucketPlacerInfoBuilder(WantENode)] ++
450464
build_encryption_at_rest_info(Bucket, Snapshot, InfoNode) ++
451-
PerNodeStorageBackendBuilder(WantENode, Bucket),
465+
PerNodeStorageBackendBuilder(WantENode, Bucket) ++
466+
PerNodeEvictionPolicyBuilder(WantENode, Bucket),
452467

453468
NodeHash = erlang:phash2(StableInfo),
454469

@@ -505,6 +520,28 @@ build_storage_backend(Node, BucketConfig) ->
505520
[{storageBackend, NodeStorageBackend}]
506521
end.
507522

523+
%% Builds the per-node evictionPolicy entry for the node info proplist.
%%
%% Returns [] when the allow_online_eviction_policy_change config key is not
%% enabled, or when ns_bucket:node_eviction_policy_override/2 yields
%% `undefined' for Node. Otherwise returns [{evictionPolicy, Binary}] where
%% Binary is the camelCase name of the override value.
%%
%% The override value is matched in a single case (rather than first testing
%% for `undefined' and then matching the same value again); an unexpected
%% value still fails with case_clause.
build_eviction_policy(Node, BucketConfig) ->
    case ns_config:read_key_fast(allow_online_eviction_policy_change, false) of
        true ->
            case ns_bucket:node_eviction_policy_override(Node, BucketConfig) of
                undefined ->
                    [];
                value_only ->
                    [{evictionPolicy, <<"valueOnly">>}];
                full_eviction ->
                    [{evictionPolicy, <<"fullEviction">>}];
                no_eviction ->
                    [{evictionPolicy, <<"noEviction">>}];
                nru_eviction ->
                    [{evictionPolicy, <<"nruEviction">>}]
            end;
        false ->
            []
    end.
544+
508545
build_failover_status(Snapshot, Node) ->
509546
PrevFailoverNodes = chronicle_master:get_prev_failover_nodes(Snapshot),
510547
case lists:member(Node, PrevFailoverNodes) of

0 commit comments

Comments
 (0)