Skip to content

Commit 4806adf

Browse files
author
Erlang/OTP
committed
Merge branch 'lukas/kernel/fix-application-shutdown-deadlock/OTP-19078' into maint-26
* lukas/kernel/fix-application-shutdown-deadlock/OTP-19078: kernel: Fix group to not crash if group_history fails kernel: Fix AC to respond to calls while terminating
2 parents 810d89e + 172a15c commit 4806adf

File tree

7 files changed

+131
-46
lines changed

7 files changed

+131
-46
lines changed

lib/kernel/src/application_controller.erl

Lines changed: 51 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -209,10 +209,10 @@ start(KernelApp) ->
209209
%% Returns: ok | {error, Reason}
210210
%%-----------------------------------------------------------------
211211
load_application(Application) ->
212-
gen_server:call(?AC, {load_application, Application}, infinity).
212+
call({load_application, Application}, infinity).
213213

214214
unload_application(AppName) ->
215-
gen_server:call(?AC, {unload_application, AppName}, infinity).
215+
call({unload_application, AppName}, infinity).
216216

217217
%%-----------------------------------------------------------------
218218
%% Func: start_application/2
@@ -236,7 +236,7 @@ unload_application(AppName) ->
236236
%% Returns: ok | {error, Reason}
237237
%%-----------------------------------------------------------------
238238
start_application(AppName, RestartType) ->
239-
gen_server:call(?AC, {start_application, AppName, RestartType}, infinity).
239+
call({start_application, AppName, RestartType}, infinity).
240240

241241
start_application_request(AppName, RestartType) ->
242242
gen_server:send_request(?AC, {start_application, AppName, RestartType}).
@@ -250,7 +250,7 @@ start_application_request(AppName, RestartType) ->
250250
%% Returns: boolean
251251
%%-----------------------------------------------------------------
252252
is_running(AppName) when is_atom(AppName) ->
253-
gen_server:call(?AC, {is_running, AppName}, infinity).
253+
call({is_running, AppName}, infinity).
254254

255255
%%-----------------------------------------------------------------
256256
%% Func: start_boot_application/2
@@ -263,9 +263,9 @@ start_boot_application(Application, RestartType) ->
263263
case {application:load(Application), RestartType} of
264264
{ok, _} ->
265265
AppName = get_appl_name(Application),
266-
gen_server:call(?AC, {start_application, AppName, RestartType}, infinity);
266+
call({start_application, AppName, RestartType}, infinity);
267267
{{error, {already_loaded, AppName}}, _} ->
268-
gen_server:call(?AC, {start_application, AppName, RestartType}, infinity);
268+
call({start_application, AppName, RestartType}, infinity);
269269
{{error,{bad_environment_value,Env}}, permanent} ->
270270
Txt = io_lib:format("Bad environment variable: ~tp Application: ~p",
271271
[Env, Application]),
@@ -275,15 +275,15 @@ start_boot_application(Application, RestartType) ->
275275
end.
276276

277277
stop_application(AppName) ->
278-
gen_server:call(?AC, {stop_application, AppName}, infinity).
278+
call({stop_application, AppName}, infinity).
279279

280280
%%-----------------------------------------------------------------
281281
%% Returns: [{Name, Descr, Vsn}]
282282
%%-----------------------------------------------------------------
283283
which_applications() ->
284-
gen_server:call(?AC, which_applications).
284+
call(which_applications).
285285
which_applications(Timeout) ->
286-
gen_server:call(?AC, which_applications, Timeout).
286+
call(which_applications, Timeout).
287287

288288
loaded_applications() ->
289289
ets:select(ac_tab,
@@ -295,10 +295,10 @@ loaded_applications() ->
295295

296296
%% Returns some debug info
297297
info() ->
298-
gen_server:call(?AC, info).
298+
call(info).
299299

300300
control_application(AppName) ->
301-
gen_server:call(?AC, {control_application, AppName}, infinity).
301+
call({control_application, AppName}, infinity).
302302

303303
%%-----------------------------------------------------------------
304304
%% Func: change_application_data/2
@@ -323,21 +323,14 @@ control_application(AppName) ->
323323
%% some applicatation may have got new config data.
324324
%%-----------------------------------------------------------------
325325
change_application_data(Applications, Config) ->
326-
gen_server:call(?AC,
327-
{change_application_data, Applications, Config},
328-
infinity).
326+
call({change_application_data, Applications, Config},infinity).
329327

330328
prep_config_change() ->
331-
gen_server:call(?AC,
332-
prep_config_change,
333-
infinity).
329+
call(prep_config_change, infinity).
334330

335331

336332
config_change(EnvPrev) ->
337-
gen_server:call(?AC,
338-
{config_change, EnvPrev},
339-
infinity).
340-
333+
call({config_change, EnvPrev},infinity).
341334

342335

343336
get_pid_env(Master, Key) ->
@@ -439,7 +432,7 @@ get_all_key(AppName) ->
439432
start_type(Master) ->
440433
case ets:match(ac_tab, {{application_master, '$1'}, Master}) of
441434
[[AppName]] ->
442-
gen_server:call(?AC, {start_type, AppName}, infinity);
435+
call({start_type, AppName}, infinity);
443436
_X ->
444437
undefined
445438
end.
@@ -475,31 +468,44 @@ get_application_module(_Module, []) ->
475468
undefined.
476469

477470
permit_application(ApplName, Flag) ->
478-
gen_server:call(?AC,
479-
{permit_application, ApplName, Flag},
480-
infinity).
471+
call({permit_application, ApplName, Flag},infinity).
481472

482473
set_env(Config, Opts) ->
483474
case check_conf_data(Config) of
484475
ok ->
485476
Timeout = proplists:get_value(timeout, Opts, 5000),
486-
gen_server:call(?AC, {set_env, Config, Opts}, Timeout);
477+
call({set_env, Config, Opts}, Timeout);
487478

488479
{error, _} = Error ->
489480
Error
490481
end.
491482

492483
set_env(AppName, Key, Val) ->
493-
gen_server:call(?AC, {set_env, AppName, Key, Val, []}).
484+
call({set_env, AppName, Key, Val, []}).
494485
set_env(AppName, Key, Val, Opts) ->
495486
Timeout = proplists:get_value(timeout, Opts, 5000),
496-
gen_server:call(?AC, {set_env, AppName, Key, Val, Opts}, Timeout).
487+
call({set_env, AppName, Key, Val, Opts}, Timeout).
497488

498489
unset_env(AppName, Key) ->
499-
gen_server:call(?AC, {unset_env, AppName, Key, []}).
490+
call({unset_env, AppName, Key, []}).
500491
unset_env(AppName, Key, Opts) ->
501492
Timeout = proplists:get_value(timeout, Opts, 5000),
502-
gen_server:call(?AC, {unset_env, AppName, Key, Opts}, Timeout).
493+
call({unset_env, AppName, Key, Opts}, Timeout).
494+
495+
call(Cmd) ->
496+
case gen_server:call(?AC, Cmd) of
497+
{error, terminating} ->
498+
exit(terminating);
499+
Res ->
500+
Res
501+
end.
502+
call(Cmd, Timeout) ->
503+
case gen_server:call(?AC, Cmd, Timeout) of
504+
{error, terminating} ->
505+
exit(terminating);
506+
Res ->
507+
Res
508+
end.
503509

504510
%%%-----------------------------------------------------------------
505511
%%% call-back functions from gen_server
@@ -1233,14 +1239,21 @@ terminate(Reason, S) ->
12331239
%% Proc died before link
12341240
{'EXIT', Id, _} -> ok
12351241
after 0 ->
1236-
receive
1237-
{'DOWN', Ref, process, Id, _} -> ok
1238-
after ShutdownTimeout ->
1239-
exit(Id, kill),
1240-
receive
1241-
{'DOWN', Ref, process, Id, _} -> ok
1242-
end
1243-
end
1242+
(fun F() ->
1243+
receive
1244+
{'DOWN', Ref, process, Id, _} -> ok;
1245+
%% We need to handle any gen_server:call here
1246+
%% and reply to them so that they don't deadlock
1247+
{'$gen_call', From, _Msg} ->
1248+
gen_server:reply(From, {error, terminating}),
1249+
F()
1250+
after ShutdownTimeout ->
1251+
exit(Id, kill),
1252+
receive
1253+
{'DOWN', Ref, process, Id, _} -> ok
1254+
end
1255+
end
1256+
end)()
12441257
end;
12451258
(_) -> ok
12461259
end,

lib/kernel/src/group.erl

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919
%%
2020
-module(group).
2121

22+
-include_lib("kernel/include/logger.hrl").
23+
2224
%% A group leader process for user io.
2325
%% This process receives input data from user_drv in this format
2426
%% {Drv,{data,unicode:charlist()}}
@@ -1005,7 +1007,12 @@ save_line_buffer("\n", Lines) ->
10051007
save_line_buffer(Line, [Line|_Lines]=Lines) ->
10061008
save_line_buffer(Lines);
10071009
save_line_buffer(Line, Lines) ->
1008-
group_history:add(Line),
1010+
try
1011+
group_history:add(Line)
1012+
catch E:R:ST ->
1013+
?LOG_ERROR(#{ msg => "Failed to write to shell history",
1014+
error => {E, R, ST} })
1015+
end,
10091016
save_line_buffer([Line|Lines]).
10101017

10111018
save_line_buffer(Lines) ->

lib/kernel/src/group_history.erl

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -374,7 +374,10 @@ find_path() ->
374374
to_drop() ->
375375
case application:get_env(kernel, shell_history_drop) of
376376
undefined ->
377-
application:set_env(kernel, shell_history_drop, ?DEFAULT_DROP),
377+
%% The AC might be overloaded/not responding and
378+
%% we want the shell to be as responsive as possible
379+
%% so we set a short timeout
380+
application:set_env(kernel, shell_history_drop, ?DEFAULT_DROP, [{timeout, 10}]),
378381
?DEFAULT_DROP;
379382
{ok, V} when is_list(V) -> [Ln++"\n" || Ln <- V];
380383
{ok, _} -> ?DEFAULT_DROP

lib/kernel/src/kernel.app.src

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,8 @@
157157
{net_tickintensity, 4},
158158
{net_ticktime, 60},
159159
{prevent_overlapping_partitions, true},
160-
{shell_docs_ansi,auto}
160+
{shell_docs_ansi,auto},
161+
{shell_history_drop,[]}
161162
]},
162163
{mod, {kernel, []}},
163164
{runtime_dependencies, ["erts-14.0", "stdlib-5.0",

lib/kernel/test/application_SUITE.erl

Lines changed: 58 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
-module(application_SUITE).
2121

2222
-include_lib("common_test/include/ct.hrl").
23+
-include_lib("stdlib/include/assert.hrl").
2324

2425
-export([all/0, suite/0,groups/0,init_per_suite/1, end_per_suite/1,
2526
init_per_group/2,end_per_group/2
@@ -38,7 +39,8 @@
3839
-export([config_change/1, persistent_env/1, invalid_app_file/1,
3940
distr_changed_tc1/1, distr_changed_tc2/1,
4041
ensure_started/1, ensure_all_started/1,
41-
shutdown_func/1, do_shutdown/1, shutdown_timeout/1, shutdown_deadlock/1,
42+
shutdown_func/1, do_shutdown/1, shutdown_timeout/1,
43+
shutdown_application_call/1,shutdown_deadlock/1,
4244
config_relative_paths/1, handle_many_config_files/1,
4345
format_log_1/1, format_log_2/1,
4446
configfd_bash/1, configfd_port_program/1]).
@@ -60,7 +62,7 @@ all() ->
6062
permit_false_start_dist, get_key, get_env, ensure_all_started,
6163
set_env, set_env_persistent, set_env_errors, get_supervisor,
6264
{group, distr_changed}, config_change, shutdown_func, shutdown_timeout,
63-
shutdown_deadlock, config_relative_paths, optional_applications,
65+
shutdown_application_call, shutdown_deadlock, config_relative_paths, optional_applications,
6466
persistent_env, handle_many_config_files, format_log_1, format_log_2,
6567
configfd_bash, configfd_port_program, invalid_app_file].
6668

@@ -2576,6 +2578,60 @@ shutdown_timeout(Config) when is_list(Config) ->
25762578
end,
25772579
ok.
25782580

2581+
%%%-----------------------------------------------------------------
2582+
%%% Test that we do not cause a deadlock if we call
2583+
%%% application:set_env or application:ensure_started
2584+
%%% when terminating
2585+
%%%-----------------------------------------------------------------
2586+
shutdown_application_call(Config) when is_list(Config) ->
2587+
Tester = self(),
2588+
shutdown_application_call(
2589+
fun() ->
2590+
Tester ! {Tester,
2591+
catch application:set_env(
2592+
deadlock, a, b, [{timeout, infinity},
2593+
{persistent, true}])}
2594+
end, Config),
2595+
receive
2596+
{Tester, M} ->
2597+
?assertMatch({'EXIT',terminating}, M)
2598+
after 1000 ->
2599+
ct:fail("timeout 1 sec: no crash message found")
2600+
end,
2601+
2602+
shutdown_application_call(
2603+
fun() ->
2604+
Tester ! {Tester, catch application:ensure_started(runtime_tools)}
2605+
end, Config),
2606+
receive
2607+
{Tester, M2} ->
2608+
?assertMatch({'EXIT',terminating}, M2)
2609+
after 1000 ->
2610+
ct:fail("timeout 1 sec: no crash message found")
2611+
end.
2612+
2613+
shutdown_application_call(Fun, Config) ->
2614+
2615+
DataDir = proplists:get_value(data_dir,Config),
2616+
{ok,Cp1} = start_node(?MODULE_STRING++"_"++atom_to_list(?FUNCTION_NAME)),
2617+
wait_for_ready_net(),
2618+
rpc:call(Cp1, code, add_path, [filename:join([DataDir,deadlock])]),
2619+
rpc:call(Cp1, code, add_path, [filename:dirname(code:which(?MODULE))]),
2620+
ok = rpc:call(Cp1, application, start, [sasl]),
2621+
2622+
ok = rpc:call(Cp1, application, start, [deadlock]),
2623+
rpc:call(Cp1, application, set_env, [deadlock, fail_stop, Fun]),
2624+
2625+
ok = net_kernel:monitor_nodes(true),
2626+
_ = rpc:call(Cp1, init, stop, []),
2627+
receive
2628+
{nodedown,Cp1} ->
2629+
ok
2630+
after 10000 ->
2631+
ct:fail("timeout 10 sec: node termination hangs")
2632+
end,
2633+
ok.
2634+
25792635
%%%-----------------------------------------------------------------
25802636
%%% Provokes a (previous) application shutdown deadlock
25812637
%%%-----------------------------------------------------------------

lib/kernel/test/application_SUITE_data/deadlock/deadlock.erl

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,9 @@ handle_info(_Msg, State) ->
5454
terminate(_Reason, _State) ->
5555
case application:get_env(deadlock, fail_stop) of
5656
{ok, false} -> ok;
57-
{ok, Tester} ->
57+
{ok, Fun} when is_function(Fun, 0) ->
58+
Fun();
59+
{ok, Tester} when is_pid(Tester) ->
5860
Tester ! {deadlock, self()},
5961
io:format("~p: Waiting in terminate (~p)~n",[?MODULE,Tester]),
6062
receive continue -> ok end

lib/sasl/test/release_handler_SUITE.erl

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2388,7 +2388,10 @@ wait_nodes_up(Nodes, Tag, Apps, N) ->
23882388
fun(NodeInfo={Node,OldInitPid}, A) ->
23892389
case rpc:call(Node, application, which_applications, []) of
23902390
{badrpc, nodedown} ->
2391-
test_server:format( " ~p = {badarg, nodedown}",[Node]),
2391+
test_server:format( " ~p = {badrpc, nodedown}",[Node]),
2392+
[NodeInfo | A];
2393+
{badrpc, {'EXIT',terminating}} ->
2394+
test_server:format( " ~p = {badrpc, {'EXIT',terminating}}",[Node]),
23922395
[NodeInfo | A];
23932396
List when is_list(List)->
23942397
test_server:format( " ~p = [~p]",[Node, List]),
@@ -2406,7 +2409,7 @@ wait_nodes_up(Nodes, Tag, Apps, N) ->
24062409
false ->
24072410
[NodeInfo | A]
24082411
end
2409-
end
2412+
end
24102413
end,
24112414
Pang = lists:foldl(Fun,[],Nodes),
24122415
case Pang of

0 commit comments

Comments
 (0)